備忘録

Author

佐野孔亮

バイオインフォマ関係の覚書

基本的な事項

ssh gw.ddbj.nig.ac.jp
#スパコンへのログイン。この後にqloginを実行して作業ノードへ移る

scp ~/Desctop/hoge kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/bio/
#ローカルの~/Desctop/にあるhogeというファイルを遺伝研スパコンの/home/kosukesano/bio/にコピーする。ローカルで実行する。ディレクトリをコピーする場合はscp -rとする。

scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/bio/hoge ~/Desctop/
#遺伝研スパコンの/home/kosukesano/bio/にあるhogeというファイルをローカルの~/Desctop/にコピーする。**ローカルで実行する。**

source ~/tools/pyenv_env/braker_profile
# braker環境を立ち上げる際、初めに行う。pyenvとcondaにパスを通し、conda環境に入った後にbraker環境に入る。

source ~/tools/pyenv_env/EDTA_profile
# EDTA環境を立ち上げる際、初めに行う。pyenvとconda、mambaにパスを通し、mambaforge環境に入った後にEDTA環境に入る。

source ~/pyenv_conda_environment/.pyenv_profile
# pyenvを実行する際、初めに行う。パスを通す。

source ~/tools/pyenv_env/ETE_profile 
# ETEを使う際、初めに行う。ETE用の環境に入る。



ssh scorpion
#牧野研スコーピオンサーバーへのログイン。

2024年4月

0430

遺伝研スパコンのホームディレクトリの中身を全てHDDに移した。場所は /Volumes/Elements/240430_ddbj_backup/kosukesano$

# 実行したコード
$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/ /Volumes/Elements/240430_ddbj_backup

2024年5月

0501

遺伝研スパコンのAnaconda3miniforgeを削除した。またホームディレクトリにあったファイルも(ディレクトリ以外は)削除した。

anacondaのアンインストール参考

# Anacondaのアンインストール
$ conda install anaconda-clean
$ anaconda-clean
$ rm -fr /anaconda3

miniforgeのアンインストール参考

# miniforgeのアンインストール
$ rm -rf ~/.conda
# ~/.local/bin/ になぜかmambaがいたので、
$ rm mamba

遺伝研の環境初期化ページに倣い、.bashrc.bash_profileを書き直した。遺伝研初期化ページ参考

変更前の.bash_profile

# .bash_profile

# Get the aliases and functions
#
if [ -f ~/.bashrc ]; then
        . ~/.bashrc
fi

##############################

source ~/.bashrc

#############################


# User specific environment and startup programs

PATH=$PATH:$HOME/.local/bin:$HOME/bin

export PATH


export GENEMARK_PATH=/home/kosukesano/local/gmes_linux_64_4
export PROTHINT_PATH=/home/kosukesano/local/gmes_linux_64_4/ProtHint/bin
export ALIGNMENT_TOOL_PATH=/home/kosukesano/local/spaln-master
export CDBTOOLS_PATH=/home/kosukesano/local/cdbfasta-master

#source ~/.bash_profile

変更後の.bash_profile

# .bash_profile

# Get the aliases and functions
if [ -f ~/.bashrc ]; then
    . ~/.bashrc
fi

# User specific environment and startup programs

PATH=$PATH:$HOME/.local/bin:$HOME/bin

export PATH

変更前の.bashrc

# .bashrc

# Source global definitions
if [ -f /etc/bashrc ]; then
        . /etc/bashrc
fi

##########################################

# If this variable is already set, skip the rest of the script
if [ -n "$BASHRC_LOADED" ]; then
    return
fi

# Set the variable to indicate that the script has been loaded
BASHRC_LOADED=1

# ---

# ~/.bashrc: executed by bash(1) for non-login shells.
# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
# for examples

# If not running interactively, don't do anything
case $- in
    *i*) ;;
      *) return;;
esac

# don't put duplicate lines or lines starting with space in the history.
# See bash(1) for more options
HISTCONTROL=ignoreboth

# append to the history file, don't overwrite it
shopt -s histappend

# for setting history length see HISTSIZE and HISTFILESIZE in bash(1)
HISTSIZE=1000
HISTFILESIZE=2000

# check the window size after each command and, if necessary,
# update the values of LINES and COLUMNS.
shopt -s checkwinsize

# If set, the pattern "**" used in a pathname expansion context will
# match all files and zero or more directories and subdirectories.
#shopt -s globstar

# make less more friendly for non-text input files, see lesspipe(1)
[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)"

# set variable identifying the chroot you work in (used in the prompt below)
if [ -z "${debian_chroot:-}" ] && [ -r /etc/debian_chroot ]; then
    debian_chroot=$(cat /etc/debian_chroot)
fi

# set a fancy prompt (non-color, unless we know we "want" color)
case "$TERM" in
    xterm-color|*-256color) color_prompt=yes;;
esac

# uncomment for a colored prompt, if the terminal has the capability; turned
# off by default to not distract the user: the focus in a terminal window
# should be on the output of commands, not on the prompt
#force_color_prompt=yes

if [ -n "$force_color_prompt" ]; then
    if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
    # We have color support; assume it's compliant with Ecma-48
    # (ISO/IEC-6429). (Lack of such support is extremely rare, and such
    # a case would tend to support setf rather than setaf.)
    color_prompt=yes
    else
    color_prompt=
    fi
fi

if [ "$color_prompt" = yes ]; then
    PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
else
    PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
fi
unset color_prompt force_color_prompt

# If this is an xterm set the title to user@host:dir
case "$TERM" in
xterm*|rxvt*)
    PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
    ;;
*)
    ;;
esac

# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
    test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
    alias ls='ls --color=auto'
    #alias dir='dir --color=auto'
    #alias vdir='vdir --color=auto'

    alias grep='grep --color=auto'
    alias fgrep='fgrep --color=auto'
    alias egrep='egrep --color=auto'
fi

# colored GCC warnings and errors
#export GCC_COLORS='error=01;31:warning=01;35:note=01;36:caret=01;32:locus=01:quote=01'

# some more ls aliases
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'

# Add an "alert" alias for long running commands.  Use like so:
#   sleep 10; alert
alias alert='notify-send --urgency=low -i "$([ $? = 0 ] && echo terminal || echo error)" "$(history|tail -n1|sed -e '\''s/^\s*[0-9]\+\s*//;s/[;&|]\s*alert$//'\'')"'

# Alias definitions.
# You may want to put all your additions into a separate file like
# ~/.bash_aliases, instead of adding them here directly.
# See /usr/share/doc/bash-doc/examples in the bash-doc package.

if [ -f ~/.bash_aliases ]; then
    . ~/.bash_aliases
fi

# enable programmable completion features (you don't need to enable
# this, if it's already enabled in /etc/bash.bashrc and /etc/profile
# sources /etc/bash.bashrc).
if ! shopt -oq posix; then
  if [ -f /usr/share/bash-completion/bash_completion ]; then
    . /usr/share/bash-completion/bash_completion
  elif [ -f /etc/bash_completion ]; then
    . /etc/bash_completion
  fi
fi





#########################################

# Uncomment the following line if you don't like systemctl's auto-paging feature:
# export SYSTEMD_PAGER=

# User specific aliases and functions
#module load gcc

# <<< conda initialize <<<

# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
    test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
    alias ls='ls --color=auto'
    #alias dir='dir --color=auto'
    #alias vdir='vdir --color=auto'

    alias grep='grep --color=auto'
    alias fgrep='fgrep --color=auto'
    alias egrep='egrep --color=auto'
fi

変更後の.bashrc

# .bashrc

# Source global definitions
if [ -f /etc/bashrc ]; then
    . /etc/bashrc
fi

# Uncomment the following line if you don't like systemctl's auto-paging feature:
# export SYSTEMD_PAGER=

# User specific aliases and functions
module load gcc

元の.bashrcにあった記述のうち、色に関わる内容をもう一度記載。

.bashrcへの加筆内容


###ここから下は主に書き加えた部分###

####################################################################################
# set a fancy prompt (non-color, unless we know we "want" color)
case "$TERM" in
    xterm-color|*-256color) color_prompt=yes;;
esac
####################################################################################
#↑により、カラー対応の端末で Bash を実行している場合にのみ、色付きのプロンプトが表示される



###################################################################################
# uncomment for a colored prompt, if the terminal has the capability; turned
# off by default to not distract the user: the focus in a terminal window
# should be on the output of commands, not on the prompt
#force_color_prompt=yes

if [ -n "$force_color_prompt" ]; then
    if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
    # We have color support; assume it's compliant with Ecma-48
    # (ISO/IEC-6429). (Lack of such support is extremely rare, and such
    # a case would tend to support setf rather than setaf.)
    color_prompt=yes
    else
    color_prompt=
    fi
fi

if [ "$color_prompt" = yes ]; then
    PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
else
    PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
fi
unset color_prompt force_color_prompt

# If this is an xterm set the title to user@host:dir
case "$TERM" in
xterm*|rxvt*)
    PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
    ;;
*)
    ;;
esac
####################################################################################
#↑ターミナルプロンプトの色や表示を設定する



#####################################################################################
# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
    test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
    alias ls='ls --color=auto'
    #alias dir='dir --color=auto'
    #alias vdir='vdir --color=auto'

    alias grep='grep --color=auto'
    alias fgrep='fgrep --color=auto'
    alias egrep='egrep --color=auto'
fi
#######################################################################################
#↑ls および grep コマンドに色のサポートをつける

また、ログイン時にGCCに関わる部分で以下のエラーが発生した。

[kosukesano@gwB1 ~]$ qlogin
Your job 25915671 ("QLOGIN") has been submitted
waiting for interactive job to be scheduled ...
Your interactive job 25915671 has been successfully scheduled.
Establishing /home/geadmin/AGER/utilbin/lx-amd64/qlogin_wrapper session to host at137 ...
Warning: Permanently added '[at137]:41669,[172.19.7.185]:41669' (ECDSA) to the list of known hosts.
Welcome to Ubuntu 22.04.3 LTS (GNU/Linux 5.15.0-87-generic x86_64)

 * Documentation:  https://help.ubuntu.com
 * Management:     https://landscape.canonical.com
 * Support:        https://ubuntu.com/advantage

  System information as of Wed May  1 16:06:51 JST 2024

  System load:  3.0078125           Users logged in:           34
  Usage of /:   32.8% of 823.03GB   IPv4 address for eno1:     172.19.18.185
  Memory usage: 34%                 IPv4 address for eno2:     192.168.50.187
  Swap usage:   27%                 IPv4 address for ibp161s0: 172.19.7.185
  Processes:    2133

  => There is 1 zombie process.

 * Strictly confined Kubernetes makes edge and IoT secure. Learn how MicroK8s
   just raised the bar for easy, resilient and secure K8s cluster deployment.

   https://ubuntu.com/engage/secure-kubernetes-at-the-edge

Expanded Security Maintenance for Applications is not enabled.

11 updates can be applied immediately.
To see these additional updates run: apt list --upgradable

56 additional security updates can be applied with ESM Apps.
Learn more about enabling ESM Apps service at https://ubuntu.com/esm


Last login: Tue Apr 30 10:14:26 2024 from 172.19.7.250
ERROR: Unable to locate a modulefile for 'gcc'

かつての.bashrcを見るとmodule load gccという記述がコメントアウトされていた。暫定的に今回もコメントアウトしておく。

# User specific aliases and functions
#module load gcc
#↑この部分でなんかエラーが出たんだけど、昔のbashrcはコメントアウトしちゃってたので同様の処置をとった。

0502

pyenvのインストール

kosukesano@at139:~$ git clone git://github.com/yyuu/pyenv.git ~/.pyenv
# pyenvをgitでインストール

.bash_profileなどにパスを書くと何かミスがあった場合重大なことになるため、別のプロファイルを作ってそこにパスを書く

kosukesano@at139:~$ mkdir pyenv_conda_environment
# pyenv_conda_environmentというディレクトリをホーム直下に作成
kosukesano@at139:~$ cd pyenv_conda_environment/
kosukesano@at139:~/pyenv_conda_environment$ nano .pyenv_profile
# .pyenv_profileというファイルを作成

.pyenv_profileの中身

export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"

この後、pyenvを打ってもCommand not foundと出てしまうが、source ~/pyenv_conda_environment/.pyenv_profileで先ほどのプロファイルをソースすると、pyenvが機能するようになる。

今後もpyenvを使う場合は毎回初めにsource ~/pyenv_conda_environment/.pyenv_profileを行う。

kosukesano@at137:~$ pyenv
Command 'pyenv' not found, did you mean:
  command 'p7env' from deb libnss3-tools (2:3.68.2-0ubuntu1.2)
Try: apt install <deb name>
kosukesano@at137:~$ source ~/pyenv_conda_environment/.pyenv_profile 
kosukesano@at137:~$ pyenv
pyenv 2.4.0-3-g3ff54e89
Usage: pyenv <command> [<args>]

Some useful pyenv commands are:
   --version   Display the version of pyenv
   .
   .
   .
   .
   .

pyenvにてanaconda3環境の構築

kosukesano@at137:~$ pyenv install  anaconda3-2023.09-0

↑を作業ノード@137で実行したらうまくいかなかった。conda.exeが作業ノードに高負荷を与えていると遺伝研の方から言われた。

原因 condaが重い

メモリをめちゃくちゃ増やしたらなんとかなった

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 1
#$ -l s_vmem=48G
#$ -l mem_req=48G
date

echo starting at date
source ~/pyenv_conda_environment/.pyenv_profile
pyenv install anaconda3-2020.11

date

/home/kosukesano/.pyenv/versionanaconda3-2020.11を作成した。

~/tools/pyenv_envを作成、その下にbraker_profileを作成した。

source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global anaconda3-2020.11



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/anaconda3-2020.11/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/kosukesano/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh" ]; then
        . "/home/kosukesano/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh"
    else
        export PATH="/home/kosukesano/.pyenv/versions/anaconda3-2020.11/bin:$PATH"
    fi
fi
unset __conda_setup
# <<< conda initialize <<<

conda activate braker

brakerのインストールの前準備

うまくいったやつ

conda install -c anaconda perl
conda install -c anaconda biopython

うまくいかなかったやつ

conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-file-spec
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-list-util
conda install -c bioconda perl-module-load-conditional
conda install -c bioconda perl-posix
conda install -c bioconda perl-file-homedir
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-exception-class
(braker) kosukesano@at137:~/tools/braker$ conda install -c bioconda perl-list-util
Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.

ResolvePackageNotFound: 
  - python=3.1

(braker) kosukesano@at137:~/tools/braker$ conda install -c bioconda perl-module-load-conditional
Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.

ResolvePackageNotFound: 
  - python=3.1

こんな感じのエラーが出てインストールできなかった

0507

現在のbraker環境を削除

(braker) kosukesano@at137:~$ conda deactivate
(base) kosukesano@at137:~$ conda remove -n braker --all

Remove all packages in environment /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker:


## Package Plan ##

  environment location: /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker


The following packages will be REMOVED:

  _libgcc_mutex-0.1-main
  _openmp_mutex-5.1-1_gnu
  biopython-1.78-py312h5eee18b_0
  blas-1.0-mkl
  bzip2-1.0.8-h7b6447c_0
  ca-certificates-2023.08.22-h06a4308_0
  expat-2.5.0-h6a678d5_0
  gdbm-1.18-hd4cb3f1_4
  intel-openmp-2023.1.0-hdb19cb5_46306
  ld_impl_linux-64-2.38-h1181459_1
  libffi-3.4.4-h6a678d5_0
  libgcc-ng-11.2.0-h1234567_1
  libgomp-11.2.0-h1234567_1
  libstdcxx-ng-11.2.0-h1234567_1
  libuuid-1.41.5-h5eee18b_0
  mkl-2023.1.0-h213fc3f_46344
  mkl-service-2.4.0-py312h5eee18b_1
  ncurses-6.4-h6a678d5_0
  numpy-1.26.0-py312hc5e2394_0
  numpy-base-1.26.0-py312h0da6c21_0
  openssl-3.0.12-h7f8727e_0
  perl-5.34.0-h5eee18b_2
  pip-23.3-py312h06a4308_0
  python-3.12.0-h996f2a0_0
  readline-8.2-h5eee18b_0
  setuptools-68.0.0-py312h06a4308_0
  sqlite-3.41.2-h5eee18b_0
  tbb-2021.8.0-hdb19cb5_0
  tk-8.6.12-h1ccaba5_0
  tzdata-2023c-h04d1e81_0
  wheel-0.41.2-py312h06a4308_0
  xz-5.4.2-h5eee18b_0
  zlib-1.2.13-h5eee18b_0


Proceed ([y]/n)? y

Preparing transaction: done
Verifying transaction: done
Executing transaction: done
(base) kosukesano@at137

改めてbraker環境を構築、python=3.9に指定

(base) kosukesano@at137:~/tools$ conda create -n braker python=3.9
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 4.9.2
  latest version: 24.4.0

Please update conda by running

    $ conda update -n base -c defaults conda



## Package Plan ##

  environment location: /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker

  added / updated specs:
    - python=3.9


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    _libgcc_mutex-0.1          |             main           3 KB
    _openmp_mutex-5.1          |            1_gnu          21 KB
    ca-certificates-2024.3.11  |       h06a4308_0         127 KB
    ld_impl_linux-64-2.38      |       h1181459_1         654 KB
    libffi-3.4.4               |       h6a678d5_1         141 KB
    libgcc-ng-11.2.0           |       h1234567_1         5.3 MB
    libgomp-11.2.0             |       h1234567_1         474 KB
    libstdcxx-ng-11.2.0        |       h1234567_1         4.7 MB
    ncurses-6.4                |       h6a678d5_0         914 KB
    openssl-3.0.13             |       h7f8727e_1         5.2 MB
    pip-23.3.1                 |   py39h06a4308_0         2.6 MB
    python-3.9.19              |       h955ad1f_1        25.1 MB
    readline-8.2               |       h5eee18b_0         357 KB
    setuptools-69.5.1          |   py39h06a4308_0        1003 KB
    sqlite-3.45.3              |       h5eee18b_0         1.2 MB
    tk-8.6.14                  |       h39e8969_0         3.4 MB
    tzdata-2024a               |       h04d1e81_0         116 KB
    wheel-0.43.0               |   py39h06a4308_0         109 KB
    xz-5.4.6                   |       h5eee18b_1         643 KB
    zlib-1.2.13                |       h5eee18b_1         111 KB
    ------------------------------------------------------------
                                           Total:        52.2 MB

The following NEW packages will be INSTALLED:

  _libgcc_mutex      pkgs/main/linux-64::_libgcc_mutex-0.1-main
  _openmp_mutex      pkgs/main/linux-64::_openmp_mutex-5.1-1_gnu
  ca-certificates    pkgs/main/linux-64::ca-certificates-2024.3.11-h06a4308_0
  ld_impl_linux-64   pkgs/main/linux-64::ld_impl_linux-64-2.38-h1181459_1
  libffi             pkgs/main/linux-64::libffi-3.4.4-h6a678d5_1
  libgcc-ng          pkgs/main/linux-64::libgcc-ng-11.2.0-h1234567_1
  libgomp            pkgs/main/linux-64::libgomp-11.2.0-h1234567_1
  libstdcxx-ng       pkgs/main/linux-64::libstdcxx-ng-11.2.0-h1234567_1
  ncurses            pkgs/main/linux-64::ncurses-6.4-h6a678d5_0
  openssl            pkgs/main/linux-64::openssl-3.0.13-h7f8727e_1
  pip                pkgs/main/linux-64::pip-23.3.1-py39h06a4308_0
  python             pkgs/main/linux-64::python-3.9.19-h955ad1f_1
  readline           pkgs/main/linux-64::readline-8.2-h5eee18b_0
  setuptools         pkgs/main/linux-64::setuptools-69.5.1-py39h06a4308_0
  sqlite             pkgs/main/linux-64::sqlite-3.45.3-h5eee18b_0
  tk                 pkgs/main/linux-64::tk-8.6.14-h39e8969_0
  tzdata             pkgs/main/noarch::tzdata-2024a-h04d1e81_0
  wheel              pkgs/main/linux-64::wheel-0.43.0-py39h06a4308_0
  xz                 pkgs/main/linux-64::xz-5.4.6-h5eee18b_1
  zlib               pkgs/main/linux-64::zlib-1.2.13-h5eee18b_1


Proceed ([y]/n)? y


Downloading and Extracting Packages
tk-8.6.14            | 3.4 MB    | #################################################################################################################################################### | 100% 
ca-certificates-2024 | 127 KB    | #################################################################################################################################################### | 100% 
libffi-3.4.4         | 141 KB    | #################################################################################################################################################### | 100% 
_openmp_mutex-5.1    | 21 KB     | #################################################################################################################################################### | 100% 
xz-5.4.6             | 643 KB    | #################################################################################################################################################### | 100% 
ld_impl_linux-64-2.3 | 654 KB    | #################################################################################################################################################### | 100% 
sqlite-3.45.3        | 1.2 MB    | #################################################################################################################################################### | 100% 
python-3.9.19        | 25.1 MB   | #################################################################################################################################################### | 100% 
openssl-3.0.13       | 5.2 MB    | #################################################################################################################################################### | 100% 
pip-23.3.1           | 2.6 MB    | #################################################################################################################################################### | 100% 
libgcc-ng-11.2.0     | 5.3 MB    | #################################################################################################################################################### | 100% 
setuptools-69.5.1    | 1003 KB   | #################################################################################################################################################### | 100% 
zlib-1.2.13          | 111 KB    | #################################################################################################################################################### | 100% 
wheel-0.43.0         | 109 KB    | #################################################################################################################################################### | 100% 
libgomp-11.2.0       | 474 KB    | #################################################################################################################################################### | 100% 
tzdata-2024a         | 116 KB    | #################################################################################################################################################### | 100% 
readline-8.2         | 357 KB    | #################################################################################################################################################### | 100% 
_libgcc_mutex-0.1    | 3 KB      | #################################################################################################################################################### | 100% 
libstdcxx-ng-11.2.0  | 4.7 MB    | #################################################################################################################################################### | 100% 
ncurses-6.4          | 914 KB    | #################################################################################################################################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
#
# To activate this environment, use
#
#     $ conda activate braker
#
# To deactivate an active environment, use
#
#     $ conda deactivate

(base) kosukesano@at137:~/tools$ source ~/tools/pyenv_env/braker_profile
(braker) kosukesano@at137:~/tools$

環境はこうなった

(braker) kosukesano@at137:~/tools$ conda info

     active environment : braker
    active env location : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker
            shell level : 2
       user config file : /home/kosukesano/.condarc
 populated config files : 
          conda version : 4.9.2
    conda-build version : 3.20.5
         python version : 3.8.5.final.0
       virtual packages : __glibc=2.35=0
                          __unix=0=0
                          __archspec=1=x86_64
       base environment : /home/kosukesano/.pyenv/versions/anaconda3-2020.11  (writable)
           channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/linux-64
                          https://repo.anaconda.com/pkgs/r/noarch
          package cache : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/pkgs
                          /home/kosukesano/.conda/pkgs
       envs directories : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs
                          /home/kosukesano/.conda/envs
               platform : linux-64
             user-agent : conda/4.9.2 requests/2.24.0 CPython/3.8.5 Linux/5.15.0-87-generic ubuntu/22.04.3 glibc/2.35
                UID:GID : 6811:10086
             netrc file : None
           offline mode : False

(braker) kosukesano@at137:~/tools$

昔はこうだったんだけど、なんか変わったんか?

(braker) kosukesano@at137:~$ conda info

     active environment : braker
    active env location : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker
            shell level : 2
       user config file : /home/kosukesano/.condarc
 populated config files : 
          conda version : 4.9.2
    conda-build version : 3.20.5
         python version : 3.8.5.final.0
       virtual packages : __glibc=2.35=0
                          __unix=0=0
                          __archspec=1=x86_64
       base environment : /home/kosukesano/.pyenv/versions/anaconda3-2020.11  (writable)
           channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
                          https://repo.anaconda.com/pkgs/main/noarch
                          https://repo.anaconda.com/pkgs/r/linux-64
                          https://repo.anaconda.com/pkgs/r/noarch
          package cache : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/pkgs
                          /home/kosukesano/.conda/pkgs
       envs directories : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs
                          /home/kosukesano/.conda/envs
               platform : linux-64
             user-agent : conda/4.9.2 requests/2.24.0 CPython/3.8.5 Linux/5.15.0-87-generic ubuntu/22.04.3 glibc/2.35
                UID:GID : 6811:10086
             netrc file : None
           offline mode : False

(braker) kosukesano@at137:~$

改めてbrakerインストールの前準備を行う

conda install -c anaconda perl
conda install -c anaconda biopython
conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-file-spec
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-list-util
conda install -c bioconda perl-module-load-conditional
conda install -c bioconda perl-posix
conda install -c bioconda perl-file-homedir
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-exception-class
conda install -c bioconda perl-test-pod
# なんか変な出力だったけど、多分うまくいってる
conda install -c bioconda perl-file-which # skip if you are not comparing to reference annotation
(braker) kosukesano@at137:~/tools$ conda install -c bioconda perl-file-which # skip if you are not comparing to reference annotation
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 4.9.2
  latest version: 24.4.0

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

(braker) kosukesano@at137:~/tools$ 

conda install -c bioconda perl-mce
conda install -c bioconda perl-threaded

conda install -c bioconda perl-list-util
(braker) kosukesano@at137:~/tools$ conda install -c bioconda perl-list-util
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 4.9.2
  latest version: 24.4.0

Please update conda by running

    $ conda update -n base -c defaults conda



# All requested packages already installed.

(braker) kosukesano@at137:~/tools$ 

conda install -c bioconda perl-math-utils
conda install -c bioconda cdbtools
conda install -c eumetsat perl-yaml-xs
conda install -c bioconda perl-data-dumper

perlモジュールのインストール

#いけたやつ
cpanm Hash::Merge
cpanm List::Util
cpanm MCE::Mutex
cpanm Module::Load::Conditional
cpanm Parallel::Forkcpanm 
cpanm Scalar::Util::Numeric
cpanm YAML
cpanm Math::Utils
cpanm File::HomeDir
cpanm Thread::Queue

#いけなかったやつ
cpanm File::Spec::Functions
cpanm YAML::XS
cpanm Data::Dumper
cpanm threads

#skip?
(braker) kosukesano@at137:~/tools$ cpanm POSIX
skipping R/RJ/RJBS/perl-5.38.0.tar.gz

いけなかったやつについて、x86_64-conda_cos6-linux-gnu-gccがないことが原因らしい。それを入れてみる。

conda install anaconda::gcc_linux-64
# インストールできた

改めてperlのモジュールをインストールしてみる

(braker) kosukesano@at137:~/tools$ cpanm File::Spec::Functions
--> Working on File::Spec::Functions
Fetching http://www.cpan.org/authors/id/X/XS/XSAWYERX/PathTools-3.75.tar.gz ... OK
Configuring PathTools-3.75 ... OK
Building and testing PathTools-3.75 ... FAIL
! Installing File::Spec::Functions failed. See /home/kosukesano/.cpanm/work/1715072449.2386863/build.log for details. Retry with --force to force install it.
(braker) kosukesano@at137:~/tools$

0510

BRAKER本体のインストール

~/tool直下にbraker_git_installというディレクトリを作成し、そこでgit cloneを実行。

BRAKERのgithubページ

(braker) kosukesano@at139:~/tools$ mkdir braker_git_install
(braker) kosukesano@at139:~/tools$ cd braker_git_install/
(braker) kosukesano@at139:~/tools/braker_git_install$ git clone https://github.com/Gaius-Augustus/BRAKER.git
Cloning into 'BRAKER'...
remote: Enumerating objects: 7324, done.
remote: Counting objects: 100% (1666/1666), done.
remote: Compressing objects: 100% (660/660), done.
remote: Total 7324 (delta 1072), reused 1530 (delta 983), pack-reused 5658
Receiving objects: 100% (7324/7324), 123.32 MiB | 20.53 MiB/s, done.
Resolving deltas: 100% (5423/5423), done.
Updating files: 100% (152/152), done.
(braker) kosukesano@at139:~/tools/braker_git_install$ ls
BRAKER
(braker) kosukesano@at139:~/tools/braker_git_install$

BRAKERの内部で動くソフトのインストール

それぞれ~/tool直下にディレクトリを作成し、git cloneでインストールした。

git clone https://github.com/gatech-genemark/ProtHint.git
git clone https://github.com/Gaius-Augustus/TSEBRA.git
git clone https://github.com/gatech-genemark/GeneMark-ETP.git

プロテインデータベースのダウンロード

ダウンロード元

ここからAthropodaのファイルをローカルでダウンロード。遺伝研に移動しgunzipで解凍した。

# ローカルで実行
scp ~/Downloads/Arthropoda.fa.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools

# 遺伝研で実行
gunzip Arthropoda.fa.gz 

BRAKERの内部で動くソフトへのパス開通

BRAKER本体や、GeneMark-ETPなどにパスを通す。 なお、GeneMark-ETPについては、GeneMark-ETP/binだけでなく、その下のGeneMark-ETP/bin/gmesGeneMark-ETP/bin/gmstを個別に用いることがあり、それぞれ別にパスを通す。

パスは全てbraker.profileに追記し、braker.profilesourceすることでパスも一緒に通るようにした。

# braker.profileの追記内容

#################################

export PATH="~/tools/braker_git_install/BRAKER/scripts:$PATH"
export PATH="~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin:$PATH"
export PATH="~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes:$PATH"
export PATH="~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmst:$PATH"
export PATH="~/tools/ProtHint_git_install/ProtHint/bin:$PATH"
export PATH="~/tools/TSEBRA_git_install/TSEBRA/bin:$PATH"

マダラのゲノムとタンパク質データベースを用いた BRAKERのテストラン

/home/kosukesano/tools/for_brakertestというディレクトリを作成。その中でbrakertest.shを作成。

# brakertest.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=24G
#$ -l mem_req=24G
echo start at
date

source ~/tools/pyenv_env/braker_profile
braker.pl --genome=~/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=~/tools/Arthropoda.fa --threads=2

date

ジョブを投げて結果を待つ。

0514

brakertest.shの結果

# brakertest.sh.e26009591の一部

#*********
# WARNING: /lustre7/home/kosukesano/../config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /lustre7/home/kosukesano/../config!
#*********
# Fri May 10 16:11:59 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /usr/share/augustus/config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
#*********
# Fri May 10 16:11:59 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$AUGUSTUS_CONFIG_PATH not set!

AUGUSTUS_CONFIG_PATHが通っていないというエラー。

解決策

# .bash_profileに以下を追加
export AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"

その後もパスが通っていないことに関するエラーが多発。

### GENEMARK_PATHが通らないエラー ############################################

# Tue May 14 15:26:11 2024: Found environment variable $GENEMARK_PATH.
# Tue May 14 15:26:11 2024: Checking ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
#*********
# WARNING: ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes is not a directory. Will not set $GENEMARK_PATH to ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
#*********
# Tue May 14 15:26:11 2024: Checking ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes/ as potential path for $GENEMARK_PATH.
#*********
# WARNING: ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes/ is not a directory. Will not set $GENEMARK_PATH to ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes/!
#*********
# Tue May 14 15:26:11 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$GENEMARK_PATH not set!
There are 3 alternative ways to set GENEMARK_PATH for
braker.pl:
   a) provide command-line argument --GENEMARK_PATH=/your/path
   b) use an existing environment variable $GENEMARK_PATH
      for setting the environment variable, run
           export GENEMARK_PATH=/your/path
      in your shell. You may append this to your .bashrc or 
      .profile file in order to make the variable available to
      all your bash sessions.
   c) braker.pl can try guessing the location of 
      GENEMARK_PATH from the location of gmes_petap.pl
      executable if it is available in your $PATH variable.
      If you try to rely on this option, you can check by
      typing
           which gmes_petap.pl
      in your shell, whether the executable is in your $PATH
Tue May 14 15:26:11 JST 2024

### PROTHINT_PATHが通らないエラー ############################################

# Tue May 14 15:27:58 2024: Trying to set $PROTHINT_PATH...
# Tue May 14 15:27:58 2024: Found environment variable $PROTHINT_PATH.
# Tue May 14 15:27:58 2024: Checking ~/tools/ProtHint_git_install/ProtHint/bin/ as potential path for $PROTHINT_PATH.
#*********
# WARNING: ~/tools/ProtHint_git_install/ProtHint/bin/ is not a directory. Will not set $PROTHINT_PATH to ~/tools/ProtHint_git_install/ProtHint/bin/!
#*********
# Tue May 14 15:27:58 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$PROTHINT_PATH not set!
There are 3 alternative ways to set PROTHINT_PATH for
braker.pl:
   a) provide command-line argument --PROTHINT_PATH=/your/path
   b) use an existing environment variable $PROTHINT_PATH
      for setting the environment variable, run
           export PROTHINT_PATH=/your/path
      in your shell. You may append this to your .bashrc or 
      .profile file in order to make the variable available to
      all your bash sessions.
   c) braker.pl can try guessing the location of 
      PROTHINT_PATH from the location of prothint.py
      executable if it is available in your $PATH variable.
      If you try to rely on this option, you can check by
      typing
           which prothint.py
      in your shell, whether the executable is in your $PATH
Tue May 14 15:27:58 JST 2024

結論として以下の内容をbraker_profileに追記した。

# ~/tools/pyenv_envにあるbraker_profileへの追記内容。

#################################

export AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"
export GENEMARK_PATH=~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes
export PROTHINT_PATH=~/tools/ProtHint_git_install/ProtHint/bin
export TSEBRA_PATH=~/tools/TSEBRA_git_install/TSEBRA/bin
  • AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"について
    • パスの最後に/をつける。
    • "の有無が与える影響は不明。これだとうまくいってる。
  • GENEMARK_PATH=~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmesについて
    • パスの最後には/つけてはいけない
    • "つけてはいけない
    • GENEMARK_PATHだが実際にはGeneMark-ETP/bin/gmes下にあるgmes_petap.plを参照しているため、/gmesまでパスを通す。
  • PROTHINT_PATH=~/tools/ProtHint_git_install/ProtHint/binについて
    • パスの最後には/つけてはいけない
    • "つけてはいけない
  • TSEBRA_PATH=~/tools/TSEBRA_git_install/TSEBRA/binについて
  • パス最後の/の有無が与える影響は不明。これだとうまくいってる。
  • "の有無が与える影響は不明。これだとうまくいってる。

このような処理を行なった後、brakertest.shを(時短のため)ジョブではなく自分の作業ノードで実行。

kosukesano@at137:~/tools/for_brakertest$ bash brakertest.sh
start at
Tue May 14 15:51:40 JST 2024
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=~/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=~/tools/Arthropoda.fa --threads=2
# Tue May 14 15:51:43 2024: braker.pl version 3.0.8
# Tue May 14 15:51:43 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 14 15:51:43 2024: Configuring of BRAKER for using external tools...
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
# Tue May 14 15:51:43 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=~/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=~/tools/Arthropoda.fa --threads=2
# Tue May 14 15:51:43 2024: braker.pl version 3.0.8
# Tue May 14 15:51:43 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 14 15:51:43 2024: Configuring of BRAKER for using external tools...
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
# Tue May 14 15:51:43 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_BIN_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $AUGUSTUS_BIN_PATH.
# Tue May 14 15:51:43 2024: Trying to guess AUGUSTUS_BIN_PATH from location of augustus executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /usr/bin as potential path for $AUGUSTUS_BIN_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_BIN_PATH to /usr/bin!
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_SCRIPTS_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $AUGUSTUS_SCRIPTS_PATH.
# Tue May 14 15:51:43 2024: Checking /usr/share/augustus/config//../scripts as potential path for $AUGUSTUS_SCRIPTS_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_SCRIPTS_PATH to /usr/share/augustus/config//../scripts!
# Tue May 14 15:51:43 2024: WARNING: BRAKER will copy the
 AUGUSTUS_CONFIG folder into your home directory!
# Tue May 14 15:51:43 2024: WARNING: $AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species ) is not writeable.
*** IMPORTANT: Resetting $AUGUSTUS_CONFIG_PATH=/home/kosukesano/.augustus because BRAKER requires a writable location!
# Tue May 14 15:51:43 2024: Trying to set $PYTHON3_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $PYTHON3_PATH.
# Tue May 14 15:51:43 2024: Trying to guess PYTHON3_PATH from location of python3 executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $PYTHON3_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $PYTHON3_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 14 15:51:43 2024: Trying to set $GENEMARK_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $GENEMARK_PATH.
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $GENEMARK_PATH to /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
# Tue May 14 15:51:43 2024: Trying to set $DIAMOND_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $DIAMOND_PATH.
# Tue May 14 15:51:43 2024: Trying to guess DIAMOND_PATH from location of diamond executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /usr/bin as potential path for $DIAMOND_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $DIAMOND_PATH to /usr/bin!
# Tue May 14 15:51:43 2024: Trying to set $PROTHINT_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $PROTHINT_PATH.
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin as potential path for $PROTHINT_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $PROTHINT_PATH to /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin!
# Tue May 14 15:51:43 2024: Trying to set $TSEBRA_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $TSEBRA_PATH.
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin as potential path for $TSEBRA_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $TSEBRA_PATH to /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin!
# Tue May 14 15:51:43 2024: Trying to set $CDBTOOLS_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $CDBTOOLS_PATH.
# Tue May 14 15:51:43 2024: Trying to guess CDBTOOLS_PATH from location of cdbfasta executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $CDBTOOLS_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $CDBTOOLS_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 14 15:51:45 2024: BRAKER will execute GeneMark-EP for training GeneMark and generating a training gene set for AUGUSTUS, using protein information as sole extrinsic evidence source.
#*********
# IMPORTANT INFORMATION: no species for identifying the AUGUSTUS  parameter set that will arise from this BRAKER run was set. BRAKER will create an AUGUSTUS parameter set with name Sp_1. This parameter set can be used for future BRAKER/AUGUSTUS prediction runs for the same species. It is usually not necessary to retrain AUGUSTUS with novel extrinsic data if a high quality parameter set already exists.
#*********
# Tue May 14 15:51:45 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 3633
protein sequence file /lustre7/home/kosukesano/tools/for_brakertest/~/tools/Arthropoda.fa does not exist.
Tue May 14 15:51:45 JST 2024
kosukesano@at137:~/tools/for_brakertest$ 

protein sequence file does not exist. ???

0515

カレントディレクトリでbash brakertest.shを実行するとBRAKERが走る?

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date

source ~/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2

date

Arthropoda.fa does not exist.というエラーは入力ファイルを絶対指定していない事が原因らしい。絶対指定したところエラーがなくなった。

実行後にはbrakerというディレクトリが~/tools/for_brakertest下にできる。しかしこれが邪魔するのか、もう一度実行しようとすると以下のエラーが生じる。

kosukesano@at137:~/tools/for_brakertest$ bash brakertest.sh
start at
Wed May 15 15:33:32 JST 2024
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2
# Wed May 15 15:33:33 2024: braker.pl version 3.0.8
# Wed May 15 15:33:33 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 15 15:33:33 2024: Configuring of BRAKER for using external tools...
# Wed May 15 15:33:33 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 15 15:33:33 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 15 15:33:33 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
# Wed May 15 15:33:33 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
# Wed May 15 15:33:33 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
# Wed May 15 15:33:33 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_brakertest/braker/braker.log
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1240
Failed to create direcotry /lustre7/home/kosukesano/tools/for_brakertest/braker/GeneMark-ES!
Wed May 15 15:33:33 JST 2024
kosukesano@at137:~/tools/for_brakertest$

同じファイルをジョブとして投げるとAUGUSTUSのパスが通らない

Use of uninitialized value in subroutine entry at /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl line 1920.
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin --
# Wed May 15 15:26:19 2024: braker.pl version 3.0.8
# Wed May 15 15:26:19 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 15 15:26:19 2024: Configuring of BRAKER for using external tools...
# Wed May 15 15:26:19 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 15 15:26:19 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 15 15:26:19 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /usr/share/augustus/config/ is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
#*********
# Wed May 15 15:26:19 2024: Checking /lustre7/home/kosukesano/tools/../config as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /lustre7/home/kosukesano/tools/../config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /lustre7/home/kosukesano/tools/../config!
#*********
# Wed May 15 15:26:19 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /usr/share/augustus/config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
#*********
# Wed May 15 15:26:19 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$AUGUSTUS_CONFIG_PATH not set!
There are 3 alternative ways to set this variable for braker.pl:
   a) provide command-line argument --AUGUSTUS_CONFIG_PATH=/your/path
   b) use an existing environment variable $AUGUSTUS_CONFIG_PATH
      for setting the environment variable, run
           export AUGUSTUS_CONFIG_PATH=/your/path
      in your shell. You may append this to your .bashrc or
      .profile file in order to make the variable available to all
      your bash sessions.
   c) braker.pl can try guessing the location of
      $AUGUSTUS_CONFIG_PATH from an augustus executable that is
      available in your $PATH variable.
      If you try to rely on this option, you can check by typing
           which augustus
      in your shell, whether there is an augustus executable in
      your $PATH
      Be aware: the $AUGUSTUS_CONFIG_PATH must be writable for
                braker.pl because braker.pl is a pipeline that
                optimizes parameters that reside in that
                directory. This might be problematic in case you
                are using a system-wide installed augustus 
                installation that resides in a directory that is
                not writable to you as a user.

前に見たエラーと同じ……。

パスが認識されていない?

0520

pyenv下でmambaforge環境を作成

EDTA全般の参考ページ

遺伝研の作業ノードで実行した。

kosukesano@at138:~/pyenv_conda_environment$ pyenv install mambaforge-22.9.0-3
Downloading Mambaforge-22.9.0-3-Linux-x86_64.sh.sh...
-> https://github.com/conda-forge/miniforge/releases/download/22.9.0-3/Mambaforge-22.9.0-3-Linux-x86_64.sh
Installing Mambaforge-22.9.0-3-Linux-x86_64.sh...
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 22.9.0
  latest version: 24.5.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3

  added / updated specs:
    - conda=22.9.0
    - pip


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.2.2   |       hbcca054_0         152 KB  conda-forge
    certifi-2024.2.2           |     pyhd8ed1ab_0         157 KB  conda-forge
    openssl-3.3.0              |       hd590300_0         2.8 MB  conda-forge
    pip-24.0                   |     pyhd8ed1ab_0         1.3 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.4 MB

The following packages will be UPDATED:

  ca-certificates                      2022.12.7-ha878542_0 --> 2024.2.2-hbcca054_0 None
  certifi                            2022.12.7-pyhd8ed1ab_0 --> 2024.2.2-pyhd8ed1ab_0 None
  openssl                                  3.0.7-h0b41bf4_1 --> 3.3.0-hd590300_0 None
  pip                                   22.3.1-pyhd8ed1ab_0 --> 24.0-pyhd8ed1ab_0 None



Downloading and Extracting Packages
pip-24.0             | 1.3 MB    | #################################################################################################################################################### | 100% 
openssl-3.3.0        | 2.8 MB    | #################################################################################################################################################### | 100% 
certifi-2024.2.2     | 157 KB    | #################################################################################################################################################### | 100% 
ca-certificates-2024 | 152 KB    | #################################################################################################################################################### | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Retrieving notices: ...working... done
Installed Mambaforge-22.9.0-3-Linux-x86_64.sh to /home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3
kosukesano@at138:~/pyenv_conda_environment$

EDTA環境の構築

~/tools/pyenv_env下にEDTA_profileを作成

### EDTA_profileの中身

source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
        . "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
    else
        export PATH="/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
    fi
fi
unset __conda_setup
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
    . "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<

conda activate EDTA2

(これをsourceすればEDTAが動く)

上記シェルスクリプトをsourceしてmambaforge環境を立ち上げたのち、以下のコマンドでEDTAをインストール、EDTA2という環境を構築した

git clone https://github.com/oushujun/EDTA.git
cd EDTA
mamba env create -f EDTA_2.2.x.yml 
conda activate EDTA2

最新のマダラゲノムを用いたsoftmaskの復習

### 最新のマダラゲノムを遺伝研に転送
scp /Volumes/Elements_1/240514_new_weebil_genome/231117_madaragenome_fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano

### nama_dataというディレクトリにそれを格納、ついでに拡張子を.fastaにしておく
(EDTA2) kosukesano@at138:~$ mv 231117_madaragenome_fasta ~/tools/for_softmask/nama_data
(EDTA2) kosukesano@at138:~$ cd ~/tools/for_softmask/nama_data
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ ls
231117_madaragenome_fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ less 231117_madaragenome_fasta 
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ mv 231117_madaragenome_fasta ~/tools/for_softmask/nama_data/231117_madaragenome.fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ 

### 下準備、BLAST_DATABASE_PREFIXという名前で、参照データベースを作成(作業ノードで実行)
(EDTA2) kosukesano@at138:~/tools/for_softmask$ BuildDatabase -name BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta
Building database BLAST_DATABASE_PREFIX:
  Reading /home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta...
Number of sequences (bp) added to database: 209 ( 1295393365 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask$

RepeatModelerの実行

### RepeatModeler_test.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database BLAST_DATABASE_PREFIX  -pa 12
date

0521

RepeatModeler_test.shの結果

### RepeatModeler_test.sh.e26118406 (エラーメッセージ)

ERROR from search engine (0) 

### RM_78034.MonMay201654522024(出力ファイル)
(EDTA2) kosukesano@at138:~/tools/for_softmask$ ls RM_78034.MonMay201654522024/
consensi.fa  families.stk  round-1  round-2
(EDTA2) kosukesano@at138:~/tools/for_softmask$

メモリが足りなかった?とりあえずメモリを48にしてもう一度qsub_beta

/usr/share/augustus/configを自身のホームディレクトリに再帰的にコピー

cp -r /usr/share/config ~/tools/AUGUSTUS_CONFIG_copy

(braker) kosukesano@at138:~/tools/AUGUSTUS_CONFIG_copy/config$ ls
cgp  extrinsic  model  profile  species
(braker) kosukesano@at138:~/tools/AUGUSTUS_CONFIG_copy/config$ 

ここをパスに指定してqsub

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2\
        --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

結果

Use of uninitialized value in subroutine entry at /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl line 1920.
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropod
a.fa --threads=2 --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH
=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Tue May 21 13:04:42 2024: braker.pl version 3.0.8
# Tue May 21 13:04:42 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 21 13:04:42 2024: Configuring of BRAKER for using external tools...
# Tue May 21 13:04:42 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 21 13:04:42 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Tue May 21 13:04:42 2024: Checking /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 21 13:04:42 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config!
# Tue May 21 13:04:42 2024: Trying to set $AUGUSTUS_BIN_PATH...
# Tue May 21 13:04:42 2024: Found environment variable $AUGUSTUS_BIN_PATH.
# Tue May 21 13:04:42 2024: Checking /usr/bin as potential path for $AUGUSTUS_BIN_PATH.
#*********
# WARNING: Couldn't find augustus in /usr/bin. Will not set $AUGUSTUS_BIN_PATH to /usr/bin!
#*********
# Tue May 21 13:04:42 2024: Checking /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin as potential path for $AUGUSTUS_BIN_PATH.
#*********
# WARNING: /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin is not a directory. Will not set $AUGUSTUS_BIN_PATH to /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin!
#*********
# Tue May 21 13:04:42 2024: Checking /usr/share/augustus/bin as potential path for $AUGUSTUS_BIN_PATH.
#*********
# WARNING: /usr/share/augustus/bin is not a directory. Will not set $AUGUSTUS_BIN_PATH to /usr/share/augustus/bin!
#*********
# Tue May 21 13:04:42 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$AUGUSTUS_BIN_PATH not set!
There are 3 alternative ways to set this variable for
braker.pl:
   a) provide command-line argument 
      --AUGUSTUS_BIN_PATH=/your/path
   b) use an existing environment variable $AUGUSTUS_BIN_PATH
      for setting the environment variable, run
           export AUGUSTUS_BIN_PATH=/your/path
      in your shell. You may append this to your .bashrc or
      .profile file in order to make the variable available to
      all your bash sessions.
   c) braker.pl can try guessing the location of 
      $AUGUSTUS_BIN_PATH from the location of 
      $AUGUSTUS_CONFIG_PATH (in this case
      /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin

augustus関係のファイルを全部自分のディレクトリにコピー

~/tools/All_AUGUSTUS_testというディレクトリを作成し、その下に/usr/share/augustusを全てコピーした。また、その下に/binディレクトリを作成し、そこに`/usr/bin/augustus/をコピーした。

また、DIAMONDSというツールが要求されたので、~/tools/DIAMOND_git_installを作成し、その下にgitでインストールした。

(braker) kosukesano@at138:~/tools$ mkdir DIAMOND_git_install
(braker) kosukesano@at138:~/tools$ cd DIAMOND_git_install/
(braker) kosukesano@at138:~/tools/DIAMOND_git_install$     wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
    tar xzf diamond-linux64.tar.gz
--2024-05-21 15:30:41--  http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz [following]
--2024-05-21 15:30:41--  https://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/31987083/313cc780-09dd-11e9-902e-599c1618e37d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240521%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240521T063041Z&X-Amz-Expires=300&X-Amz-Signature=66499e2d5de74c872454dd8ac0770632c87059513935b1208862ed19b28f4121&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=31987083&response-content-disposition=attachment%3B%20filename%3Ddiamond-linux64.tar.gz&response-content-type=application%2Foctet-stream [following]
--2024-05-21 15:30:41--  https://objects.githubusercontent.com/github-production-release-asset-2e65be/31987083/313cc780-09dd-11e9-902e-599c1618e37d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240521%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240521T063041Z&X-Amz-Expires=300&X-Amz-Signature=66499e2d5de74c872454dd8ac0770632c87059513935b1208862ed19b28f4121&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=31987083&response-content-disposition=attachment%3B%20filename%3Ddiamond-linux64.tar.gz&response-content-type=application%2Foctet-stream
Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2418573 (2.3M) [application/octet-stream]
Saving to: ‘diamond-linux64.tar.gz’

diamond-linux64.tar.gz                          100%[======================================================================================================>]   2.31M  --.-KB/s    in 0.05s   

2024-05-21 15:30:43 (50.0 MB/s) - ‘diamond-linux64.tar.gz’ saved [2418573/2418573]

(braker) kosukesano@at138:~/tools/DIAMOND_git_install$ ls
diamond  diamond-linux64.tar.gz  diamond_manual.pdf
(braker) kosukesano@at138:~/tools/DIAMOND_git_install$

brakertest.shのスクリプトとしてパスを通した


#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2\
        --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/config\
        --AUGUSTUS_BIN_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/bin\
        --AUGUSTUS_SCRIPTS_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts\
        --DIAMOND_PATH=/home/kosukesano/tools/DIAMOND_git_install\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

結果

Use of uninitialized value in subroutine entry at /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl line 1920.
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/config --AUGUSTUS_BIN_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/bin --AUGUSTUS_SCRIPTS_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts --DIAMOND_PATH=/home/kosukesano/tools/DIAMOND_git_install --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Tue May 21 15:36:14 2024: braker.pl version 3.0.8
# Tue May 21 15:36:14 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 21 15:36:14 2024: Configuring of BRAKER for using external tools...
# Tue May 21 15:36:14 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/All_AUGUSTUS_test/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /home/kosukesano/tools/All_AUGUSTUS_test/augustus/config!
# Tue May 21 15:36:14 2024: Trying to set $AUGUSTUS_BIN_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $AUGUSTUS_BIN_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/All_AUGUSTUS_test/bin as potential path for $AUGUSTUS_BIN_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $AUGUSTUS_BIN_PATH to /home/kosukesano/tools/All_AUGUSTUS_test/bin!
# Tue May 21 15:36:14 2024: Trying to set $AUGUSTUS_SCRIPTS_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $AUGUSTUS_SCRIPTS_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts as potential path for $AUGUSTUS_SCRIPTS_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $AUGUSTUS_SCRIPTS_PATH to /home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts!
# Tue May 21 15:36:14 2024: Trying to set $PYTHON3_PATH...
# Tue May 21 15:36:14 2024: Did not find environment variable $PYTHON3_PATH.
# Tue May 21 15:36:14 2024: Trying to guess PYTHON3_PATH from location of python3 executable that is available in your $PATH
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $PYTHON3_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $PYTHON3_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 21 15:36:14 2024: Trying to set $GENEMARK_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $GENEMARK_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $GENEMARK_PATH to /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
# Tue May 21 15:36:14 2024: Trying to set $DIAMOND_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $DIAMOND_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/DIAMOND_git_install as potential path for $DIAMOND_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $DIAMOND_PATH to /home/kosukesano/tools/DIAMOND_git_install!
# Tue May 21 15:36:14 2024: Trying to set $PROTHINT_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $PROTHINT_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin as potential path for $PROTHINT_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $PROTHINT_PATH to /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin!
# Tue May 21 15:36:14 2024: Trying to set $TSEBRA_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $TSEBRA_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin as potential path for $TSEBRA_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $TSEBRA_PATH to /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin!
# Tue May 21 15:36:14 2024: Trying to set $CDBTOOLS_PATH...
# Tue May 21 15:36:14 2024: Did not find environment variable $CDBTOOLS_PATH.
# Tue May 21 15:36:18 2024: Trying to guess CDBTOOLS_PATH from location of cdbfasta executable that is available in your $PATH
# Tue May 21 15:36:18 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $CDBTOOLS_PATH.
# Tue May 21 15:36:18 2024: Success! Setting $CDBTOOLS_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 21 15:36:18 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 2553
/home/kosukesano/tools/All_AUGUSTUS_test/bin/augustus not executable on this machine.

~/tools/for_brakertest/share_of_augustusを作成、その下で遺伝研の`augustus`を使うスクリプトを実行

(braker) kosukesano@at138:~/tools/for_brakertest/share_of_augustus$ bash share_brakertest.sh
start at
Tue May 21 16:16:56 JST 2024
#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config/ --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Tue May 21 16:16:59 2024: braker.pl version 3.0.8
# Tue May 21 16:16:59 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 21 16:16:59 2024: Configuring of BRAKER for using external tools...
# Tue May 21 16:16:59 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 21 16:16:59 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Tue May 21 16:16:59 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 21 16:16:59 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Tue May 21 16:16:59 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
# Tue May 21 16:17:05 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/braker.log
^C
(braker) kosukesano@at138:~/tools/for_brakertest/share_of_augustus$ ls
braker               share_brakertest.sh.e26120966  share_brakertest.sh.o26120966  share_brakertest.sh.pe26120966  share_brakertest.sh.po26120966
share_brakertest.sh  share_brakertest.sh.e26120972  share_brakertest.sh.o26120972  share_brakertest.sh.pe26120972  share_brakertest.sh.po26120972
(braker) kosukesano@at138:~/tools/for_brakertest/share_of_augustus$ 

0522

gpuノードに投げたらなんか途中まで動いた

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2\
        --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

AUGUSTUS_CONFIG_path/usr/share/augustus/configに設定した時のエラー

### share_brakertest.sh.e26123608

#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Wed May 22 11:12:37 2024: braker.pl version 3.0.8
# Wed May 22 11:12:37 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 22 11:12:37 2024: Configuring of BRAKER for using external tools...
# Wed May 22 11:12:37 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 22 11:12:37 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:12:37 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:12:37 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Wed May 22 11:12:37 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1240
Failed to create direcotry /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/GeneMark-ES!

もう一回やったら別のエラーが出た

### share_brakertest.sh.e26123620

#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Wed May 22 11:14:38 2024: braker.pl version 3.0.8
# Wed May 22 11:14:38 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 22 11:14:38 2024: Configuring of BRAKER for using external tools...
# Wed May 22 11:14:38 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 22 11:14:38 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:14:38 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:14:38 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Wed May 22 11:14:38 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 5942
failed to execute: /lustre7/home/kosukesano/tools/braker_git_install/BRAKER/scripts/get_gc_content.py --sequences /home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --print_sequence_length 1> /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/gc_content.out 2> /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/errors/gc_content.stderr!

/home/kosukesano/tools/for_brakertest/share_of_augustus/brakerを消してなかったせい?

試しに自分のディレクトリにコピーしたAUGUSTUS_CONFIG_pathを使ってみる

### share_brakertest.sh.e26123629

ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1240
Failed to create direcotry /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/GeneMark-ES!

ちなみに、/lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/errorsを見ると……

(braker) kosukesano@at138:/lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/errors$ ls
find_python3_biopython.err  find_python3_re.err  gc_content.stderr

find_python3_biopython.errfind_python3_re.errには何も書いていなかった。

gc_content.stderrを見ると……

Traceback (most recent call last):
  File "/lustre7/home/kosukesano/tools/braker_git_install/BRAKER/scripts/get_gc_content.py", line 215, in <module>
    main()
  File "/lustre7/home/kosukesano/tools/braker_git_install/BRAKER/scripts/get_gc_content.py", line 52, in main
    text = seq_file.read(int(config['mem_size']))
MemoryError

↑のやつ、もしかして/lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/でやっていたから意味なかった?

改めて自分の/home/kosukesano/.../share_of_augustus/brakerファイルを消し、/usr/share/augustus/configにパスを通して再実行

0523

~/tools/for_brakertest/share_of_augustus/output_testを作成、その下でshare_brakertest.shqsub。ジョブIDは26124509

(結果の出力ファイルであるbrakerがすでに存在しているとエラーを吐くようなので、ディレクトリを移した)

### share_brakertest.sh の中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
#$ -l s_vmem=16G
#$ -l mem_req=16G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=16\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

ジョブ終わっていないけど出力ファイルやエラーファイルができてる。リアルタイムで書き込まれるっぽい?

0524

BRAKER終了!

ID26124509のジョブが終了。~/tools/for_brakertest/share_of_augustus/output_testの下にbrakerというディレクトリができた。

### ディレクトリbrakerの中身

kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test$ ls braker/
Augustus  GeneMark-EP  GeneMark-ES  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  prothint.gff  species  what-to-cite.txt

このうちbraker.gtfが最終産物。

### braker.gtf

ptg000001l_length_376496        AUGUSTUS        gene    37356   37715   .       +       .       g1
ptg000001l_length_376496        AUGUSTUS        transcript      37356   37715   0.7     +       .       g1.t1
ptg000001l_length_376496        AUGUSTUS        start_codon     37356   37358   .       +       0       transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496        AUGUSTUS        CDS     37356   37715   0.7     +       0       transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496        AUGUSTUS        exon    37356   37715   .       +       .       transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496        AUGUSTUS        stop_codon      37713   37715   .       +       0       transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496        AUGUSTUS        gene    77496   78040   .       +       .       g2
ptg000001l_length_376496        AUGUSTUS        transcript      77496   78040   0.83    +       .       g2.t1
ptg000001l_length_376496        AUGUSTUS        start_codon     77496   77498   .       +       0       transcript_id "g2.t1"; gene_id "g2";
.
.
.
.
.
.
ptg006399l_length_14628 AUGUSTUS        transcript      6869    7388    1       -       .       g44999.t1
ptg006399l_length_14628 AUGUSTUS        stop_codon      6869    6871    .       -       0       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        CDS     6869    7221    1       -       2       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        exon    6869    7221    .       -       .       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        intron  7222    7354    1       -       .       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        CDS     7355    7388    1       -       0       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        exon    7355    7388    .       -       .       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        start_codon     7386    7388    .       -       0       transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS        gene    14340   14628   .       -       .       g45000
ptg006399l_length_14628 AUGUSTUS        transcript      14340   14628   1       -       .       g45000.t1
ptg006399l_length_14628 AUGUSTUS        stop_codon      14340   14342   .       -       0       transcript_id "g45000.t1"; gene_id "g45000";
ptg006399l_length_14628 AUGUSTUS        CDS     14340   14628   1       -       1       transcript_id "g45000.t1"; gene_id "g45000";
ptg006399l_length_14628 AUGUSTUS        exon    14340   14628   .       -       .       transcript_id "g45000.t1"; gene_id "g45000";
(END)

gene idはg45000まで?昆虫のゲノムとしては多い。

/braker/Augustus/以下にはAugustusのみの結果が出力されているみたい。seqkitを用いて遺伝子数などを確認してみる。seqkitは遺伝研のsingularityにあるものを使う。

### /braker/Augustus/の中身

kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$ ls Augustus/
augustus.hints.aa  augustus.hints.codingseq  augustus.hints.gtf

### seqkitによる遺伝子数の確認

kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat Augustus/augustus.hints.aa
file                        format  type     num_seqs     sum_len  min_len  avg_len  max_len
Augustus/augustus.hints.aa  FASTA   Protein    47,989  16,250,546        7    338.6   25,263

シーケンス数47,989braker単体よりも多い。また最小の長さが7と非常に短く、本来遺伝子ではない部分を余計にアノテーションしている?

新規マダラゲノムのソフトマスク

RepeatModeler_test.shが終了した。結局メモリ数48だと入らず、24にした。うまくいったスクリプトは以下。

### RepeatModeler_test.sh

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database BLAST_DATABASE_PREFIX  -pa 6
date

出力ファイルRepeatModeler_test.sh.o26123564を見るとこんな感じ

### RepeatModeler_test.sh.o26123564の中身

start at
Wed May 22 10:50:55 JST 2024
RepeatModeler Version 2.0.1
===========================
Search Engine = rmblast 2.14.1+
Dependencies: TRF 4.09, RECON , RepeatScout 1.0.6, RepeatMasker 4.1.2
LTR Structural Analysis: Disabled [use -LTRStruct to enable]
Random Number Seed: 1716342701
Database = BLAST_DATABASE_PREFIX .
  - Sequences = 209
  - Bases = 1295393365
  - N50 = 54629423
  - Contig Histogram:
  Size(bp)                                                        Count
  -----------------------------------------------------------------------
  150024189-160739203 |                                                   [  ]
  139309176-150024189 |                                                   [  ]
  128594163-139309176 |
.
.
.
.
.
The RepeatModeler stockholm file is formatted so that it can
easily be submitted to the Dfam database.  Please consider contributing
curated families to this open database and be a part of this growing
community resource.  For more information contact help@dfam.org.


Fri May 24 05:00:48 JST 2024
(END)

大体2日くらいかかってる。

結果はRM_16988.WedMay221052072024というディレクトリに出力された。

### RM_16988.WedMay221052072024の中身

kosukesano@at137:~/tools/for_softmask$ ls RM_16988.WedMay221052072024/
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa

このうちconsensi.fa.classifiedというファイルが最終産物。これを使って次はRepeatMaskerを動かす。RepeatMasker実行スクリプトは以下の通り

### RepeatMasker実行スクリプトRepeatMasker_test.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib ~/tools/for_softmask/RM_16988.WedMay221052072024/consensi.fa.classified  ~/tools/for_softmask/nama_data/231117_madaragenome.fasta
date

ホームディレクトリの整理

~/old_environment_until20240430を作り、localpyenv_conda_environmenttools以外のホームディレクトリ直下ディレクトリをそこに入れた。

### 整理後のホームディレクトリの様子

kosukesano@at137:~$ ls
local  old_envilonment_until20240430  pyenv_conda_environment  results_sh_eando  tools

### old_envilonment_until20240430の中身

kosukesano@at137:~$ ls old_envilonment_until20240430/
EDTA  GeMoMa_temp  busco_downloads  cafetest  gall  leaf_beetle  other_weevil  outgroup  paml_test  ronbun_sp
kosukesano@at137:~$ 

これによって従来のディレクトリ構造が変化したので注意!

フェモラータゲノムのソフトマスク

ローカルのHDDからフェモのゲノムデータSfem_assembly.fastaを持ってくる

### ローカル環境。scpでSfem_assembly.fastaを遺伝研環境にコピー。

/Volumes/Elements_1/sano/weevil_genome/femo_genome$ ls
2023.11.22.polished.annotated.genome    Sfem-1_1.fastq.gz                       Sfem-1_2.fastq.gz                       Sfem_assembly.fasta
/Volumes/Elements_1/sano/weevil_genome/femo_genome$ scp Sfem_assembly.fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Sfem_assembly.fasta                                                                                                                                          100%  481MB 105.8MB/s   00:04    
/Volumes/Elements_1/sano/weevil_genome/femo_genome$ 

~/tools/for_softmask/nama_dataに格納。

~/tools/for_softmask/下にSfemorata_softmaskディレクトリを構築。その中でフェモラータゲノムのソフトマスクを行う。

~/tools/for_softmask/Sfemorata_softmask下でBLASTデータベースを作成。データベース名はSfem_BLAST_DATABASE_PREFIXとした。


### EDTAの環境を立ち上げる

kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ source ~/tools/pyenv_env/EDTA_profile

### BLASTデータベースの構築

(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ BuildDatabase -name Sfem_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Sfem_assembly.fasta 
Building database Sfem_BLAST_DATABASE_PREFIX:
  Reading /home/kosukesano/tools/for_softmask/nama_data/Sfem_assembly.fasta...
Number of sequences (bp) added to database: 5084 ( 495627753 bp )

### lsで見てみる

(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ ls
RepeatMasker_test.sh   Sfem_BLAST_DATABASE_PREFIX.nhr  Sfem_BLAST_DATABASE_PREFIX.njs  Sfem_BLAST_DATABASE_PREFIX.nni  Sfem_BLAST_DATABASE_PREFIX.nsq
RepeatModeler_test.sh  Sfem_BLAST_DATABASE_PREFIX.nin  Sfem_BLAST_DATABASE_PREFIX.nnd  Sfem_BLAST_DATABASE_PREFIX.nog  Sfem_BLAST_DATABASE_PREFIX.translation
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ 

続いてRepeatModelerを実行する。ジョブスクリプトSfem_RepeatModeler.shは以下の通り。

### Sfem_RepeatModeler.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Sfem_BLAST_DATABASE_PREFIX  -pa 6
date

0527

新規マダラゲノムのソフトマスク続き

~/tools/for_softmask/nama_data内でMadara_ProcessRepeats.shを作成。中身は以下の通り。

### Madara_ProcessRepeats.shの内容

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

ProcessRepeats -maskSource 231117_madaragenome.fasta -xsmall -gff 231117_madaragenome.fasta.cat.gz
date

最初の実行では-maskSourceの前に全角の空白がありエラー?手直しをしてもう一度qsub

ついでにフェモラータのゲノムもソフトマスクをした

### Sfem_ProcessRepeats.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

ProcessRepeats -maskSource Sfem_assembly.fasta -xsmall -gff Sfem_assembly.fasta.cat.gz
date
~

BUSCOによるアノテーション後のマダラゲノムデータのクオリティ評価

昔ダウンロードしたODBデータと、singularityにあるBUSCOのツールを使って、BRAKERでアノテーションをつけたマダラのゲノムデータを評価した。

kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$ singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco -m protein -i braker.aa -o OUTPUT -l ~/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/ -f
INFO:   ***** Start a BUSCO v5.1.3 analysis, current time: 05/27/2024 14:03:46 *****
INFO:   Configuring BUSCO with local environment
INFO:   Mode is proteins
INFO:   'Force' option selected; overwriting previous results directory
INFO:   Downloading information on latest versions of BUSCO data...
INFO:   Input file is /home/kosukesano/tools/for_brakertest/share_of_augustus/output_test/braker/braker.aa
INFO:   Using local lineages directory /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/
INFO:   Running BUSCO using lineage dataset  (eukaryota, 2024-01-08)
INFO:   ***** Run HMMER on gene sequences *****
INFO:   Running 1013 job(s) on hmmsearch, starting at 05/27/2024 14:03:50
INFO:   [hmmsearch]     102 of 1013 task(s) completed
INFO:   [hmmsearch]     203 of 1013 task(s) completed
INFO:   [hmmsearch]     304 of 1013 task(s) completed
INFO:   [hmmsearch]     406 of 1013 task(s) completed
INFO:   [hmmsearch]     507 of 1013 task(s) completed
INFO:   [hmmsearch]     608 of 1013 task(s) completed
INFO:   [hmmsearch]     710 of 1013 task(s) completed
INFO:   [hmmsearch]     811 of 1013 task(s) completed
INFO:   [hmmsearch]     912 of 1013 task(s) completed
INFO:   [hmmsearch]     1013 of 1013 task(s) completed
INFO:

        --------------------------------------------------
        |Results from dataset                             |
        --------------------------------------------------
        |C:88.3%[S:75.2%,D:13.1%],F:6.7%,M:5.0%,n:1013    |
        |895    Complete BUSCOs (C)                       |
        |762    Complete and single-copy BUSCOs (S)       |
        |133    Complete and duplicated BUSCOs (D)        |
        |68     Fragmented BUSCOs (F)                     |
        |50     Missing BUSCOs (M)                        |
        |1013   Total BUSCO groups searched               |
        --------------------------------------------------
INFO:   BUSCO analysis done. Total running time: 477 seconds
INFO:   Results written in /home/kosukesano/tools/for_brakertest/share_of_augustus/output_test/braker/OUTPUT
INFO:   For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html

kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$

結果:88%

まあまあ?

0528

RNA-seqデータを用いたBRAKERのテストラン

~/tools/for_brakertest/share_of_augustusの下にrnaplus_output_testディレクトリを作成。その下にRNA_brakertest.shold_madaragenome_softmasked.fasta(旧out.p_ctg.fa.sort.softmasked.fasta)を用意。またchanged_id_test_rnaディレクトリを作成。

### RNA_brakertest.shの中身

#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
#$ -l s_vmem=16G
#$ -l mem_req=16G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_brakertest/share_of_augustus/rnaplus_output_test/old_madaragenome_softmasked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2 \
        --rnaseq_sets_dir=/home/kosukesano/tools/for_brakertest/share_of_augustus/rnaplus_output_test/changed_id_test_rna\
        --threads=16\
        --species=Smadaranus\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

またchanged_id_test_rnaディレクトリを作成。

(braker) kosukesano@at137:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$ ls changed_id_test_rna/
adult-1_1.fastq  adult-1_2.fastq  adult-2_1.fastq  adult-2_2.fastq  adult-3_1.fastq  adult-3_2.fastq
(braker) kosukesano@at137:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$

最初に以下のエラーが出た.

### RNA_brakertest.sh.e26144319の一部抜粋

# Tue May 28 11:37:09 2024: Trying to set $GENEMARK_PATH...
# Tue May 28 11:37:09 2024: Found command line argument $GENEMARK_PATH.
# Tue May 28 11:37:09 2024: Checking /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
#*********
# WARNING: Couldn't find gmetp.pl in /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes. Will not set $GENEMARK_PATH to /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
#*********
# Tue May 28 11:37:09 2024: Did not find environment variable $GENEMARK_PATH.
# Tue May 28 11:37:09 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$GENEMARK_PATH not set!

/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/binを見てみるとgmetp.plはあったが、コマンド上ではGeneMark-ETP/bin/gmesを指定していたのでエラーが起きたっぽい。

オプションにはGeneMark-ETP/bin/gmesを直接指定するものは無い。とはいえGeneMark-ETP/bin/gmesも重要だったはず。よってコマンド上では上記の/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/binを指定するようにし、braker_profileのほうでそれぞれのパスを通すように変更。

### braker_profileの中身。GeneMark-ETP関連のPATHを開通させた。

#################################

export PATH="/home/kosukesano/tools/braker_git_install/BRAKER/scripts:$PATH"
export PATH="/usr/share/augustus/config:$PATH"
#export PATH="~/tools/All_AUGUSTUS_test/augustus/config:$PATH"
export PATH="/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin:$PATH"
export PATH="/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes:$PATH"
export PATH="/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmst:$PATH"
#export PATH="/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin:$PATH"
#export PATH="/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin:$PATH"

#################################

うまく動いてそう。

マダラゲノム、フェモゲノムのソフトマスクの結果

  • マダラゲノム:231117_madaragenome.fasta.masked
  • フェモゲノム:Sfem_assembly.fasta.masked

どちらもソフトマスクまで完了!

BRAKERの本番ラン準備

~/tools/for_braker/nama_dataを作成。そこに上記のマダラゲノムとフェモゲノムをコピー。また名前をそれぞれ231117_Madara_softmasked.fastaSfem_softmasked.fastaに変更。またMadara_RNAseqSfemo_RNAseqディレクトリを作成し、その下にRNAデータをコピー。

0529

ソフトマスク後のフェモラータゲノムのBUSCO

### BUSCO_OUTPUT_FEMO/short_summary.specific..BUSCO_OUTPUT_FEMO.txtの中身

# BUSCO version is: 5.1.3 
# The lineage dataset is:  (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_softmask/nama_data/Sfem_assembly.fasta.masked
# BUSCO was run in mode: genome
# Gene predictor used: metaeuk

        ***** Results: *****

        C:98.9%[S:97.8%,D:1.1%],F:0.4%,M:0.7%,n:1013       
        1002    Complete BUSCOs (C)                        
        991     Complete and single-copy BUSCOs (S)        
        11      Complete and duplicated BUSCOs (D)         
        4       Fragmented BUSCOs (F)                      
        7       Missing BUSCOs (M)                         
        1013    Total BUSCO groups searched                

Dependencies and versions:
        hmmsearch: 3.1
        metaeuk: 4.a0f584d

98%、非常に高い

BRAKERの本番ラン

### madara_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
#$ -l s_vmem=16G
#$ -l mem_req=16G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/231117_Madara_softmasked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2,\
        body-1_1,body-1_2,body-2_1,body-2_2,body-3_1,body-3_2,\
        large-larva-1_1,large-larva-1_2,large-larva-2_1,large-larva-2_2,large-larva-3_1,large-larva-3_2,\
        middle-larva-1_1,middle-larva-1_2,middle-larva-2_1,middle-larva-2_2,middle-larva-3_1,middle-larva-3_2,\
        ovary-1_1,ovary-1_2,ovary-2_1,ovary-2_2,ovary-3_1,ovary-3_2 \
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Madara_RNAseq\
        --threads=16\
        --species=Smadaranus\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

0531

RNA-seqデータを用いたBRAKERのテストラン結果

braker.gtfが出力され、最後まで動作した。

### 出力ディレクトリbrakerの中身
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$ ls braker/
Augustus  GeneMark-ETP  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  species  what-to-cite.txt
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$ cd braker/

### augustus.hints.aaの要約
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat Augustus/augustus.hints.aa
file                        format  type     num_seqs     sum_len  min_len  avg_len  max_len
Augustus/augustus.hints.aa  FASTA   Protein    33,364  13,441,582       10    402.9   27,212
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test/braker$

Augustusだけだと遺伝子は3万くらい

ptg006399l_length_14628 AUGUSTUS        stop_codon      6869    6871    .       -       0       transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS        CDS     6869    7171    1       -       0       transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS        exon    6869    7171    .       -       .       transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS        start_codon     7169    7171    .       -       0       transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS        gene    14340   14555   .       -       .       g13067
ptg006399l_length_14628 AUGUSTUS        transcript      14340   14555   1       -       .       g13067.t1
ptg006399l_length_14628 AUGUSTUS        stop_codon      14340   14342   .       -       0       transcript_id "g13067.t1"; gene_id "g13067";
ptg006399l_length_14628 AUGUSTUS        CDS     14340   14555   1       -       0       transcript_id "g13067.t1"; gene_id "g13067";
ptg006399l_length_14628 AUGUSTUS        exon    14340   14555   .       -       .       transcript_id "g13067.t1"; gene_id "g13067";
ptg006399l_length_14628 AUGUSTUS        start_codon     14553   14555   .       -       0       transcript_id "g13067.t1"; gene_id "g13067";

braker全体では13067個の遺伝子が取れた。 #### 出力ファイルの一つであるbraker.aabuscoにかけてみた。 以下のスクリプトを用いてbuscoを実行

### buscoにかけるジョブスクリプトMadara_busco.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 12
echo start at
date


date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m protein\
        -i braker.aa\
        -o BUSCO_OUTPUT_MADARA_ANNOTATED_AdultRNA\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f
~

結果

# BUSCO version is: 5.1.3 
# The lineage dataset is:  (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_brakertest/share_of_augustus/rnaplus_output_test/braker/braker.aa
# BUSCO was run in mode: proteins

        ***** Results: *****

        C:91.8%[S:77.1%,D:14.7%],F:2.5%,M:5.7%,n:1013      
        930     Complete BUSCOs (C)                        
        781     Complete and single-copy BUSCOs (S)        
        149     Complete and duplicated BUSCOs (D)         
        25      Fragmented BUSCOs (F)                      
        58      Missing BUSCOs (M)                         
        1013    Total BUSCO groups searched                

Dependencies and versions:
        hmmsearch: 3.1

91.8%!?めっちゃ高いじゃん!

BRAKERの本番ランについて

投げるノードをintelに変更。より入りやすく。

また、RNA-seqデータがない時のBUSCO値を見たかったので、OnlyProtein_madaraOnlyProtein_femoというディレクトリを作り、プロテインデータだけ渡してbrakerを実行させた。

旧型マダラゲノムのBUSCO

旧マダラゲノムのBUSCO値がわからなくなったので、改めてBUSCOにかける。

2024年6月

0603

結果まとめ

  • 新規マダラゲノム
    • タンパク質リファレンスBRAKER
      • BUSCO値:89.8%
      • 遺伝子数:41802
    • タンパク質+RNAseqリファレンスBRAKER
      • BUSCO値:96.8%
      • 遺伝子数:13653
  • 旧マダラゲノム
    • タンパク質リファレンスBRAKER
      • BUSCO値:88.3%
      • 遺伝子数:45000
    • タンパク質+RNAseqリファレンスBRAKER
      • BUSCO値:91.8%
      • 遺伝子数:13067
  • フェモラータゲノム
    • タンパク質リファレンスBRAKER
      • BUSCO値:92.0%
      • 遺伝子数:16856
    • タンパク質+RNAseqリファレンスBRAKER
      • BUSCO値:73.2%
      • 遺伝子数:10818

ちなみに新規マダラゲノム全体でかかった解析時間は127.5時間だった。

0617

コフキゲノムのソフトマスク

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 48
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Kohuki_data -pa 12
date

blastpを用いた機能アノテーション、その前準備のレファレンス作成

~/reference_sequence/Sory_Tcas_Dmel_Ecol_refを作成。レファレンスに使う種は

  • 大腸菌Escherichia coli
  • ショウジョウバエDrosophila melanogaster
  • コクヌストモドキTribolium castaneum
  • ココクゾウムシSitophilus oryzae

とした。この4種のprotein.faaを上記ディレクトリにコピー。

レファレンスとするにはこれらを1つのファイルへと結合しなければならないが、それぞれの遺伝子がどの種由来かわからなくなる。そこでperlスクリプトによりヘッダーの頭に種名を加えるよう加工した。

### ヘッダー加工に使ったperlスクリプト add_hoge_to_headers.plの中身

#!/usr/bin/perl

use strict;
use warnings;
use File::Copy qw(move);

# 入力ファイルと一時ファイルの定義
my $input_fasta = 'Dmel_protein.fasta';
my $temp_fasta = 'temp.fasta';

# 入力ファイルを開く
open(my $in, '<', $input_fasta) or die "Cannot open $input_fasta: ";
# 一時ファイルを開く
open(my $out, '>', $temp_fasta) or die "Cannot open $temp_fasta: ";

# 行ごとに処理
while (my $line = <$in>) {
    if ($line =~ /^>/) {
        # ヘッダー行にhoge_を追加
        $line =~ s/^>(.*)/>Dmel_/;
    }
    print $out $line;
}

# ファイルを閉じる
close($in);
close($out);

置き換えたいファイルの名前を$input_fastaに指定。

実行の際は以下の通り

kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ chmod +x add_hoge_to_headers.pl 
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ ./add_hoge_to_headers.pl

手を加えたファイルを結合。

kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ cat Dmel_protein.fasta Ecol_protein.fasta Sory_protein.fasta Tcas_protein.fasta > merge_4sp.faa

データベースの構築

kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ singularity exec -e /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 makeblastdb -in merge_4sp.faa -out merge_4sp -dbtype prot -hash_index
WARNING: Skipping mount /opt/pkg/singularity-ce/4.0.0/var/singularity/mnt/session/etc/resolv.conf [files]: /etc/resolv.conf doesn't exist in container


Building a new DB, current time: 06/17/2024 16:46:13
New DB name:   /home/kosukesano/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/merge_4sp
New DB title:  merge_4sp.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 82065 sequences in 4.80911 seconds.
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ ls
Dmel_protein.fasta  Sory_protein.fasta  add_hoge_to_headers.pl  merge_4sp.phd  merge_4sp.phr  merge_4sp.pog  merge_4sp.psi
Ecol_protein.fasta  Tcas_protein.fasta  merge_4sp.faa           merge_4sp.phi  merge_4sp.pin  merge_4sp.psd  merge_4sp.psq
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ 

blast用のシェルスクリプト準備

### blastp_4sp_test.shの中身

#!/bin/bash
#$ -S /bin/sh
#$ -pe def_slot 8
#$ -l s_vmem=64G,mem_req=64G
#$ -cwd
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando

echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID

echo starting at
date

#BLASTの標準列名を定義

header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"

#出力ファイルの定義

output_file="out_madara_blastp_test.txt"

#列名を出力ファイルに書き込む

echo "$header" > $output_file

#BLASTpを実行し、結果を追加する

singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/madara_protein.fasta \
-db ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/merge_4sp \
-evalue 1e-04 \
-outfmt 6 >> $output_file

echo ending at
date
Singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seqkit sort --quiet -l conitg.fa Sfem_softmasked.fasta | seqkit fx2tab -l -n -i -H > length.txt

0618

blastpの結果

### out_madara_blastp_testの中身

qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
g704.t1 Sory_XP_030750051.1     82.587  201     34      1       1       201     1       200     8.77e-99        336
g704.t1 Sory_XP_030750049.1     82.587  201     34      1       1       201     1       200     2.24e-98        335
g704.t1 Sory_XP_030750048.1     82.587  201     34      1       1       201     1       200     2.85e-98        336
g704.t1 Sory_XP_030750047.1     82.587  201     34      1       1       201     1       200     2.85e-98        336
g704.t1 Tcas_XP_973129.2        69.268  205     56      1       1       198     1       205     3.66e-88        300
g704.t1 Tcas_XP_973129.2        42.857  217     89      5       873     1076    459     653     9.64e-14        77.4
g704.t1 Dmel_NP_001401025.1     55.238  210     74      3       1       196     1       204     3.49e-67        228
g704.t1 Dmel_NP_647642.2        52.655  226     85      4       1       210     1       220     1.63e-65        242
g704.t1 Dmel_NP_001246548.1     52.655  226     85      4       1       210     1       220     1.63e-65        242
g704.t1 Dmel_NP_001400988.1     52.655  226     85      4       1       210     1       220     2.69e-65        241
g704.t1 Dmel_NP_728652.1        52.655  226     85      4       1       210     1       220     3.76e-65        241
g704.t1 Dmel_NP_001401027.1     52.655  226     85      4       1       210     1       220     5.79e-65        241
g704.t1 Dmel_NP_001401026.1     52.655  226     85      4       1       210     1       220     1.06e-64        240
g704.t1 Dmel_NP_001097475.2     56.140  114     49      1       1       114     1       113     6.41e-34        144
g704.t1 Sory_XP_030750050.1     86.364  88      12      0       114     201     89      176     4.61e-31        134
g704.t1 Sory_XP_030750053.1     80.851  94      16      1       108     201     2       93      8.27e-31        132
g704.t1 Sory_XP_030750052.1     86.207  87      12      0       115     201     14      100     1.59e-30        132
g704.t1 Dmel_NP_001286903.1     52.778  108     32      2       103     196     5       107     2.62e-25        105
g704.t1 Tcas_XP_008198255.2     61.957  92      28      1       114     198     71      162     1.21e-22        105
g704.t1 Tcas_XP_008198255.2     45.622  217     83      4       873     1076    416     610     1.20e-13        77.0
g704.t1 Dmel_NP_728653.2        46.980  149     55      4       103     232     5       148     2.12e-22        96.7
g704.t1 Tcas_XP_008198256.2     61.538  91      28      1       115     198     11      101     4.04e-22        103
g704.t1 Tcas_XP_008198256.2     45.622  217     83      4       873     1076    355     549     4.51e-14        78.2
g706.t1 Sory_XP_030750036.1     85.816  423     55      4       1       423     1       418     0.0     746

できてる!

レファレンスをマダラに、クエリーをその他4種にしたblastp

データベースの構築

kosukesano@at139:~/reference_sequence/Madara$ singularity exec -e /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 makeblastdb -in madara_protein.fasta -out madara_ref -dbtype prot -hash_in
dex
WARNING: Skipping mount /opt/pkg/singularity-ce/4.0.0/var/singularity/mnt/session/etc/resolv.conf [files]: /etc/resolv.conf doesn't exist in container


Building a new DB, current time: 06/18/2024 12:51:23
New DB name:   /home/kosukesano/reference_sequence/Madara/madara_ref
New DB title:  madara_protein.fasta
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 16570 sequences in 0.594244 seconds.
kosukesano@at139:~/reference_sequence/Madara$ ls

blast用のシェルスクリプト準備

### blastp_RefAsMadara.shの中身

#!/bin/bash
#$ -S /bin/sh
#$ -pe def_slot 8
#$ -l s_vmem=64G,mem_req=64G
#$ -cwd
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando

echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID

echo starting at
date

#BLASTの標準列名を定義

header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"

#出力ファイルの定義

output_file="out_madara_as_ref_blastp_.txt"

#列名を出力ファイルに書き込む

echo "$header" > $output_file

#BLASTpを実行し、結果を追加する

#ココクゾウムシのblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Sory_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Sory_blastp_RefAsMadara.txt

#コクヌストモドキのblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Tcas_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Tcas_blastp_RefAsMadara.txt

#ショウジョウバエのblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Dmel_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Dmel_blastp_RefAsMadara.txt

#大腸菌のblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Ecol_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Ecol_blastp_RefAsMadara.txt

echo ending at
date

マダラの種名をヘッダーにするの忘れてた…..

0620

コフキゲノムソフトマスクの続き

kosukesano@at139:~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024$ ls
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa
kosukesano@at139:~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024$ 

consensi.fa.classifiedが出力され、RepeatModelerは終了した。

続いてRepeatMaskerを動かす

### Kohuki_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib ~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024/consensi.fa.classified  /home/kosukesano/tools/for_softmask/kohuki_softmask/180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta
date

これをqsub_betaで投げた

0625

遺伝研の緊急メンテナンスが修了!

0620のKohuki_RepeatMasker.shをもう一度投げた。

Orthofinderのテスト

~/tools/for_orthofinderディレクトリを作成。その中でSmad_Agra_Cass_Dpon_Sory_Tcas_fasta_dirディレクトリを作成し、以下の6種のタンパク質ファイル(.fasta)を入れた。

  • Smycronyx madaranus(マダラケシツブゾウムシ)
  • Anthonomus grandis grandis(ワタミハナゾウムシ)
  • Dendroctonus ponderosae(マツノキクイムシ)
  • Sitophilus oryzae(ココクゾウムシ)
  • Ceutorhynchus assimilis(キャベツサヤゾウムシ)
  • Tribolium castaneum(コクヌストモドキ)
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  OrthoFinder  Smad.fasta  Sory.fasta  Tcas.fasta

また、~/tools/for_orthofinder/直下にOrthofinder実行シェルスクリプトSmad_Agra_Cass_Dpon_Sory_Tcas_orthotest.shを作成した。

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder -f ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir -t 5 -a 5

date

これをqsubで投げた。

結果

kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25$ ls
Citation.txt                     Gene_Duplication_Events  Log.txt               Orthogroups  Phylogenetic_Hierarchical_Orthogroups  Putative_Xenologs    Single_Copy_Orthologue_Sequences  WorkingDirectory
Comparative_Genomics_Statistics  Gene_Trees               Orthogroup_Sequences  Orthologues  Phylogenetically_Misplaced_Genes       Resolved_Gene_Trees  Species_Tree
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25$ 

出力ファイルが全て揃った!実行できた!

0627

CAFEの前処理

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/以下を全てローカルに転送した。

ローカルでは~/bio/for_cafe/Original_dataを作り、上記ディレクトリを格納した。

Rstudioで以下のコードを実行した。


Orthologs_raw <- read_tsv(paste("Original_data/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.GeneCount.tsv", sep = "/"))

##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
  select(!c(Orthogroup, Total)) %>%
  t()

##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>% 
  apply(2, max) %>%
  as.data.frame() %>%
  rename(max_real = ".")
saisyou <- Enzan %>% 
  apply(2, min) %>%
  as.data.frame() %>%
  rename(min_real = ".")

##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
  bind_cols(saidai, saisyou)

##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>% 
  mutate(sa = max_real - min_real) %>%
  filter(max_real != min_real) %>%
  filter(sa < 50)


##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>% 
  mutate(Description = Orthogroup, ID = Orthogroup) %>%
  relocate(Description, ID) %>%
  select(!c(Orthogroup, max_real, min_real, sa))

Orthologs_3 %>% 
  write_tsv(paste("Processed_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
##Did you finish creating ultrametric tree with makeultrametric.R?

############

tree = read.tree("Original_data/OrthoFinder/Results_Jun25/Species_Tree/SpeciesTree_rooted.txt")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
  tree,
  100000,
  age.min = 152.3,  # 推定分岐年代の最小値(MYA)
  age.max = 236.2,  # 推定分岐年代の最大値(MYA)
  node = mrca,   # getMRCAで指定したノード
  S = 1,
  tol = 1e-20,
  CV = FALSE,
  eval.max = 500,
  iter.max = 500
)
is.ultrametric(tree2)  # ultrametricかどうか確認
write.tree(tree2, file = "tree_ultrametric.nwk")  # ultrametric系統樹の保存

こうしてできたOrthogroups.GeneCount2.tsvtree_ultrametric.nwkをDDBJの~/tools/for_cafe/madara_4weevil_Tcas_cafetestに転送した。

DDBJの~/tools/for_cafe/madara_4weevil_Tcas_cafetestにて、CAFE5を実行した。

kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk 

Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk 

Filtering families not present at the root from: 12784 to 8037

No root family size distribution specified, using uniform distribution

Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1

Starting Search for Initial Parameter Values
Lambda: 0.001938743751488
Score (-lnL): 112660.90445756
Lambda: 0.001938743751488
Score (-lnL): 112660.90445756
Lambda: 0.0020356809390624
Score (-lnL): 112528.29246004
Lambda: 0.0021326181266368
Score (-lnL):  112446.2150052
Lambda: 0.0022295553142112
Score (-lnL): 112408.34055546
Lambda: 0.00242342968936
.
.
.
.
.
.
.
(省略)
.
.
.

Completed 20 iterations
Time: 0H 0M 2S
Best match is: 0.0022734326447198
Final -lnL: 112404.42731051

42 values were attempted (0% rejected)

Inferring processes for Base model
Score (-lnL): 112404.42731051
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!

Starting reconstruction processes for Base model
Done!

kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$

結果

kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$ ls
Orthogroups.GeneCount2.tsv  results  tree_ultrametric.nwk
kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$ ls results/
Base_asr.tre  Base_branch_probabilities.tab  Base_change.tab  Base_clade_results.txt  Base_count.tab  Base_family_likelihoods.txt  Base_family_results.txt  Base_results.txt

ちゃんとファイルが出力された!

0628

コフキゲノムのソフトマスク結果

kosukesano@at138:~/tools/for_softmask/kohuki_softmask$ ls RM_32208.TueJun251102422024/
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta                                                    180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45387.masked
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.cat.all.gz                                         180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45387.tmp.simple1
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45383.cat                                    180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45388.masked
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45383.masked                                 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45388.tmp.simple1
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45384.cat                                    consensi.fa.classified.ndb
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45384.masked                                 consensi.fa.classified.nhr
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45385.cat                                    consensi.fa.classified.nin
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45385.masked                                 consensi.fa.classified.njs
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked                                 consensi.fa.classified.not
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.2.3.5.75.20.33.7.summary.html   consensi.fa.classified.nsq
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s1.2.3.5.75.20.33.7.1.html      consensi.fa.classified.ntf
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s1.2.3.5.75.20.33.7.1.txt.html  consensi.fa.classified.nto
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s3.2.3.5.75.20.33.7.1.html      makeblastdb.log
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s3.2.3.5.75.20.33.7.1.txt.html  ncResults-1719541376-36687.err
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s4.2.3.5.75.20.33.7.txt.html    ncResults-1719541376-36687.out
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.tmp.custom                             trfResults-1719541376-29544.err
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.tmp.simple1                            trfResults-1719541377-29544.out
kosukesano@at138:~/tools/for_softmask/kohuki_softmask$ 

途中で終わっているみたい。メモリ不足?

Reciprocal Best Hitの探索

### rbh.pyの中身

# -*- coding: utf-8 -*-
import pandas as pd
engine='python'

# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
    'qseqid': str,
    'sseqid': str,
    'pident': float,
    'length': int,
    'mismatch': int,
    'gapopen': int,
    'qstart': int,
    'qend': int,
    'sstart': int,
    'send': int,
    'evalue': float,
    'bitscore': float
}

# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []

for reverse_file in reverse_files:
    reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
    reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

    # RBHの判定
    for idx, row in forward.iterrows():
        query, subject = row['qseqid'], row['sseqid']
        reverse_hit = reverse[(reverse['qseqid'] == subject) & (reverse['sseqid'] == query)]
        if not reverse_hit.empty:
            rbh_hits.append(row)

# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\s+', index=False)
~
### rbh.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24

echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID

echo starting at
date

python /home/kosukesano/reference_sequence/rbh.py

echo ending at
date

注意

  • ノードはgpuを指定すること。mediumにはpandasが入っていない。
  • メモリは12以上を指定すること。6だとqwのままランしない。

Orthofinderのアウトプットから種の系統樹を構築する

遺伝研スパコンの~/tools/for_orthofinder/下にmake_philo_treeというディレクトリを作成し、以下のスクリプトfasta_concatinate.shを作成した。

### fasta_concatinate.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir"  ## Please replace with the actual directory containing the fasta files

# Define the output directory and output file
new="output_directory"
mkdir -p $new

# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
    cat "$file" >> "./${new}/all_seq.fa"
done


date

これを作業ノードで実行権限を付与して実行した。

kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$ chmod +x fasta_concatinate.sh
kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$ ./fasta_concatinate.sh
start at
Fri Jun 28 17:13:55 JST 2024
Fri Jun 28 17:13:56 JST 2024
kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$

output_directory/all_seq.faに全ての.fastaファイルをConcatinateしたファイルができた。

2024年7月

0701

コフキゲノムのRepeatMasker結果

0628に再度投げたジョブが終わっていた。~/tools/for_softmask/kohuki_softmask180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.cat.gzができたので成功!

上手く行ったスクリプトは以下。

### Kohuki_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 48
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 12 -lib ~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024/consensi.fa.classified  /home/kosukesano/tools/for_softmask/kohuki_softmask/180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta
date

コフキゲノムのProcessRepeat

以下のスクリプトをジョブとして投げた。

### Kohuki_ProcessRepeat.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 48
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

ProcessRepeats -maskSource 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta -xsmall -gff 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.cat.gz

date

0702

ReciprocalBestHitの検索


# -*- coding: utf-8 -*-
import pandas as pd
engine='python'

print('start')
# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
    'qseqid': str,
    'sseqid': str,
    'pident': float,
    'length': int,
    'mismatch': int,
    'gapopen': int,
    'qstart': int,
    'qend': int,
    'sstart': int,
    'send': int,
    'evalue': float,
    'bitscore': float
}

print('forward_BLAST was scanned')

# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []

for reverse_file in reverse_files:
    reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
    reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
    print('reverse_BLAST', reverse, ' was scanned')
    # RBHの判定
    for idx, row in forward.iterrows():
        query, subject = row['qseqid'], row['sseqid']
        reverse_hit = reverse[(reverse['qseqid'] == subject) & (reverse['sseqid'] == query)]
        if not reverse_hit.empty:
            rbh_hits.append(row)
        #print(reverse, ' was judged')
         print(query, 'was judged')

# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\s+', index=False)

これを実行したのだが、めちゃくちゃ遅い。どうもforward.iterrows()が悪さをしているらしい。

forward.iterrows()を使わないスクリプトを作成し、実行。

### new_rbh.pyの中身

import pandas as pd

print('start')

# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
    'qseqid': str,
    'sseqid': str,
    'pident': float,
    'length': int,
    'mismatch': int,
    'gapopen': int,
    'qstart': int,
    'qend': int,
    'sstart': int,
    'send': int,
    'evalue': float,
    'bitscore': float
}

print('forward_BLAST was scanned')

# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []

for reverse_file in reverse_files:
    reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
    reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
    print('reverse_BLAST', reverse_file, ' was scanned')

    # forwardとreverseをマージし、条件に合う行を抽出
    merged = forward.merge(reverse, left_on=['qseqid', 'sseqid'], right_on=['sseqid', 'qseqid'], suffixes=('_fwd', '_rev'))
    rbh_hits.extend(merged.to_dict('records'))
    print(len(merged), 'hits were judged')

# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\t', index=False)

import pandas as pd

print('start')

# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
    'qseqid': str,
    'sseqid': str,
    'pident': float,
    'length': int,
    'mismatch': int,
    'gapopen': int,
    'qstart': int,
    'qend': int,
    'sstart': int,
    'send': int,
    'evalue': float,
    'bitscore': float
}

print('forward_BLAST was scanned')

# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []

for reverse_file in reverse_files:
    reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
    reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
    print('reverse_BLAST', reverse_file, ' was scanned')

    # forwardとreverseをマージし、条件に合う行を抽出
    merged = forward.merge(reverse, left_on=['qseqid', 'sseqid'], right_on=['sseqid', 'qseqid'], suffixes=('_fwd', '_rev'))
    rbh_hits.extend(merged.to_dict('records'))
    print(len(merged), 'hits were judged')

# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\t', index=False)

成功!

~/reference_sequencereciprocal_best_hits_madara.txtができた。

### reciprocal_best_hits_madara.txtの中身

qseqid_fwd      sseqid_fwd      pident_fwd      length_fwd      mismatch_fwd    gapopen_fwd     qstart_fwd      qend_fwd        sstart_fwd      send_fwd        evalue_fwd      bitscore_fwd    qseqid_rev      sseqid_rev      pident_rev      length_rev      mismatch_rev    gapopen_rev     qstart_rev      qend_rev        sstart_rev      send_rev        evalue_rev      bitscore_rev
g704.t1 Sory_XP_030750051.1     82.587  201     34      1       1       201     1       200     8.77e-99        336     Sory_XP_030750051.1     g704.t1 83.505  194     31      1       1       193     1       194     1.07e-99        336.0
g704.t1 Sory_XP_030750049.1     82.587  201     34      1       1       201     1       200     2.24e-98        335     Sory_XP_030750049.1     g704.t1 83.505  194     31      1       1       193     1       194     2.67e-99        336.0
g704.t1 Sory_XP_030750048.1     82.587  201     34      1       1       201     1       200     2.85e-98        336     Sory_XP_030750048.1     g704.t1 83.505  194     31      1       1       193     1       194     2.54e-99        336.0
g704.t1 Sory_XP_030750047.1     82.587  201     34      1       1       201     1       200     2.85e-98        336     Sory_XP_030750047.1     g704.t1 83.505  194     31      1       1       193     1       194     2.54e-99        336.0
g704.t1 Sory_XP_030750050.1     86.364  88      12      0       114     201     89      176     4.61e-31        134     Sory_XP_030750050.1     g704.t1 88.889  81      9       0       89      169     114     194     1.59e-31        132.0
g704.t1 Sory_XP_030750053.1     80.851  94      16      1       108     201     2       93      8.27e-31        132     Sory_XP_030750053.1     g704.t1 84.706  85      11      1       4       86      110     194     4.19e-31        131.0
g704.t1 Sory_XP_030750052.1     86.207  87      12      0       115     201     14      100     1.59e-30        132     Sory_XP_030750052.1     g704.t1 89.873  79      8       0       15      93      116     194     6.350000000000001e-31   130.0
g706.t1 Sory_XP_030750036.1     85.816  423     55      4       1       423     1       418     0.0     746     Sory_XP_030750036.1     g706.t1 85.816  423     55      4       1       418     1       423     0.0     724.0
g706.t1 Sory_XP_030750034.1     85.309  388     52      4       36      423     46      428     0.0     675     Sory_XP_030750034.1     g706.t1 85.309  388     52      4       46      428     36      423     0.0     655.0
g706.t1 Sory_XP_030750033.1     84.478  393     52      5       35      423     62      449     0.0     674     Sory_XP_030750033.1     g706.t1 84.478  393     52      5       62      449     35      423     0.0     654.0
g706.t1 Sory_XP_030750037.1     86.126  382     48      4       42      423     13      389     0.0     672     Sory_XP_030750037.1     g706.t1 86.126  382     48      4       13      389     42      423     0.0     650.0
g706.t1 Sory_XP_030750035.1     84.655  391     54      5       33      423     39      423     0.0     671     Sory_XP_030750035.1     g706.t1 84.655  391     54      5       39      423     33      423     0.0     652.0
g706.t1 Sory_XP_030763203.1     85.676  370     46      4       27      396     17      379     0.0     650     Sory_XP_030763203.1     g706.t1 85.676  370     46      4       17      379     27      396   

コフキゲノムのProcessRepeat結果

### 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.maskedの中身

>0 edges=0 left=4 right=5 ver=1.10 style=1
ACCCAGTCCCACATCCTTCATATCCACCAGTATATTAGAAGCAATTTTCT
CATCTCTATTTGGCTCCGTTACTTTACTTGTTATGACCTACAATATTTTA
ATTGAAATCAGTACTTTCACATCAATACTGATATTTACTAGTTTTGTTGA
AACCGACCGATACCGATTTTTTGGTCATATCAGTCATTGATCCGATTTTC
CTGGCCTGCCGTTTACGCAAATTAAGTTAATTATTAAGTAACTACATGAC
TTAAAATTTCTCTAAATTAAAGTTACTCACTATAACTAAAATATTATTGT
AATGAGTAAGATTCCACATTATTAGACAATGTGTAACcagaggcggtttt
tccattggtttatttgtgcagtaaccacccaataaaataatacagttttc
aaattctcacaatatatcattaaaataatataactttatattctattaca
taaattattatttacaggttaagctctaTAAgtggaatttataaaaaaaa
tagaaatagtttcggaaatcgagtgcttatataaaagaatggacttcctc
ggccgaagataaacttagtttaagaatgtgtcataagaaactaaccccca
aattaaccaaagggatttaatttaaactaggtctaaattgacaatcgcaa
aatggagccagctgagtattcaagatatttggcgtttttgtcattgcatt
gatttattaaaatattttacataaataataatttaaaaaaagtttaaacg
tagttttaaggttgaacaatattgaataatttcgttggttTTAATCGAAA
ATTTAATTATTAGTAAGATTAAAACACtatgtttttgggccacgccccaa
atttttttagaaggttagaaaatatattgtttttatagtacacaattaat
atttttatggtaaatcaatattatagcttgttaaccatagacaaaccctc
tttgtgcgaaagtgggcctaaaaccaagggctacaaataaaaggagagcg
atatgctaa

成功!

コフキゲノムのBRAKER実行に移る。

コフキゲノムのBRAKER

~/tools/for_braker/nama_data180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.maskedkohuki_softmasked.fastaという名前でコピー。

~/tools/for_braker/Kohukiディレクトリを作成し、その直下で以下のシェルスクリプトを作成した。

### kohuki_braker.shの中身


#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/kohuki_softmasked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --threads=16\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

これをジョブとして投げた。

コフキゲノムのBUSCO

~/tools/for_braker/nama_datakohuki_busco.shを作成し、ジョブとして投げた

### kohuki_busco.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
echo start at
date


singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m genome\
        -i /home/kosukesano/tools/for_braker/nama_data/kohuki_softmasked.fasta\
        -o BUSCO_OUTPUT_KOHUKI_GENOME\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

date

0703

ReciprocalBestHitの検索

それぞれの生物種ごとにRBHを検索し、bitscoreが最も高い行のみを抽出する。bitscoreが同じ場合は一番上のもののみを抽出する。

上手くいったスクリプトは以下の通り。/home/kosukesano/reference_sequence/new_rbh.py

### new_rbh.pyの中身

import pandas as pd

print('start')

# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
    'qseqid': str,
    'sseqid': str,
    'pident': float,
    'length': int,
    'mismatch': int,
    'gapopen': int,
    'qstart': int,
    'qend': int,
    'sstart': int,
    'send': int,
    'evalue': float,
    'bitscore': float
}

print('forward_BLAST was scanned')

# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']

for reverse_file in reverse_files:
    reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
    reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']

    # forwardとreverseをマージし、条件に合う行を抽出
    merged = forward.merge(reverse, left_on=['qseqid', 'sseqid'], right_on=['sseqid', 'qseqid'], suffixes=('_fwd', '_rev'))
    print(len(merged), 'hits were judged')

# bitscore_fwdを数値型に変換
    merged['bitscore_fwd'] = pd.to_numeric(merged['bitscore_fwd'], errors='coerce')

    # qseqid_fwdが重複する行の中で、bitscore_fwdが最も高い行だけを抽出
    idx = merged.groupby('qseqid_fwd')['bitscore_fwd'].idxmax()
    best_hits = merged.loc[idx]

    # 各ファイルごとにRBH結果を保存
    output_file = f'reciprocal_best_hits_{reverse_file.split("_")[1]}.txt'
    best_hits.to_csv(output_file, sep='\t', index=False)  # タブ文字を区切り文字として使用

print('All RBH results have been saved.')

実行結果は以下の通り。

kosukesano@at137:~/reference_sequence$ python new_rbh.py 
start
forward_BLAST was scanned
4877948 hits were judged
12427340 hits were judged
8626859 hits were judged
21265 hits were judged
All RBH results have been saved.
kosukesano@at137:~/reference_sequence$ 
kosukesano@at137:~/reference_sequence$ ls reciprocal_best_hits_*
reciprocal_best_hits_Dmel.txt  reciprocal_best_hits_Ecol.txt  reciprocal_best_hits_Sory.txt  reciprocal_best_hits_Tcas.txt  reciprocal_best_hits_madara.txt
kosukesano@at137:~/reference_sequence$

reciprocal_best_hits_Dmel.txtreciprocal_best_hits_Ecol.txtreciprocal_best_hits_Sory.txtreciprocal_best_hits_Tcas.txtが出力された。

これらのファイルを遺伝子名で結合する

### merge_rbh.pyの中身

import pandas as pd

# 出力ファイルのリスト
output_files = ['reciprocal_best_hits_Ecol.txt', 'reciprocal_best_hits_Dmel.txt', 'reciprocal_best_hits_Tcas.txt', 'reciprocal_best_hits_Sory.txt']

# 各ファイルからqseqid_fwdとsseqid_fwdを抽出
dataframes = []
for file in output_files:
    df = pd.read_csv(file, sep='\t', low_memory=False)
    df = df[['qseqid_fwd', 'sseqid_fwd']]
    df.columns = ['qseqid_fwd', f'sseqid_fwd_{file.split("_")[2]}']
    dataframes.append(df)

# qseqid_fwdを基準に横に結合
merged_df = dataframes[0]
for df in dataframes[1:]:
    merged_df = pd.merge(merged_df, df, on='qseqid_fwd', how='outer')

# 結果を保存
merged_df.to_csv('merged_best_hits.txt', sep='\t', index=False)

print('Merged results have been saved.')

実行結果は以下の通り

kosukesano@at137:~/reference_sequence$ python merge_rbh.py 
/lustre7/home/kosukesano/reference_sequence/merge_rbh.py:17: FutureWarning: Passing 'suffixes' which cause duplicate columns {'sseqid_fwd_hits_x'} in the result is deprecated and will raise a MergeError in a future version.
  merged_df = pd.merge(merged_df, df, on='qseqid_fwd', how='outer')
Merged results have been saved.
kosukesano@at137:~/reference_sequence$

0705

フェモラータのゲノムが違う?

最新版のフェモラータゲノム(pilonでアセンブルされている)を/Volumes/Elements_1/240705/2023.11.22.polished.annotated.genomeに置いた。

:/Volumes/Elements_1/240705/2023.11.22.polished.annotated.genome$ ls
Bessho                        Sfem_protein.faa              assembly.pilon.annotation.txt bwa_pilon_sagra_20231019.txt
Sfem_cds.fasta                assembly.pilon.annotation.gff assembly.pilon.fasta          eukaryotic_gene_finding.pdf

遺伝研スパコンで~/tools/for_softmask/nama_data/Sfem_pilonディレクトリを作成し、assembly.pilon.fastaをコピーした。

###ローカルで実行
:~/Desktop$ scp /Volumes/Elements_1/240705/2023.11.22.polished.annotated.genome/assembly.pilon.fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data/Sfem_pilon
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
assembly.pilon.fasta                                                                                                                                             100%  479MB  75.7MB/s   00:06    
:~/Desktop$
###遺伝研スパコンの様子

kosukesano@at137:~/tools/for_softmask/nama_data/Sfem_pilon$ ls
assembly.pilon.fasta

新規フェモラータゲノムのソフトマスク

遺伝研スパコンにて~/tools/for_softmask/Sfemorata_pilon_softmaskディレクトリを作成し、以下を実行。

kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ source ~/pyenv_conda_environment/.pyenv_profile
kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ source ~/tools/pyenv_env/EDTA_profile
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ BuildDatabase -name Sfem_BLAST_DATABASE ~/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta
Building database Sfem_BLAST_DATABASE:
  Reading /home/kosukesano/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta...
Number of sequences (bp) added to database: 5084 ( 495481058 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ ls
Sfem_BLAST_DATABASE.nhr  Sfem_BLAST_DATABASE.njs  Sfem_BLAST_DATABASE.nni  Sfem_BLAST_DATABASE.nsq
Sfem_BLAST_DATABASE.nin  Sfem_BLAST_DATABASE.nnd  Sfem_BLAST_DATABASE.nog  Sfem_BLAST_DATABASE.translation
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$

Sfem_RepeatModeler.shを作成しジョブとして投げた。

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Sfem_BLAST_DATABASE -pa 6
date

昔フェモラータのソフトマスクしたときのファイルをほぼコピーしただけ

blastpの結果と機能アノテーションの紐付け

まずショウジョウバエのgenomic.gffから、遺伝子IDと機能情報を対応させた辞書を作る。作成用のスクリプトは以下の通り

### makedic_test.pyの中身

import pandas as pd
import re

# GFFファイルのパス
gff_file = '/home/kosukesano/old_envilonment_until20240430/outgroup/Drosophila_melanogaster/ncbi_dataset/data/GCF_000001215.4/genomic.gff'
# merged_best_hits.txtのパス
merged_file = 'merged_best_hits.txt'
# gene_function.txtのパス
gene_function_file = 'gene_function.txt'

# GFFファイルから遺伝子IDとタンパク質の機能名を抽出
gene_function = {}

with open(gff_file, 'r') as file:
    for line in file:
        if line.startswith('#') or line.strip() == '':
            continue
        parts = line.strip().split('\t')
        if len(parts) < 9:
            continue
        attributes = parts[8]
        match_gene_id = re.search(r'GeneID:(\d+)', attributes)
        match_product = re.search(r'product=([^;]+)', attributes)
        if match_gene_id and match_product:
            gene_id = match_gene_id.group(1)
            product = match_product.group(1)
            gene_function[gene_id] = product

# gene_functionをgene_function.txtとして保存
with open(gene_function_file, 'w') as file:
    for gene_id, product in gene_function.items():
        file.write(f"{gene_id}\t{product}\n")

0708

blastpの結果と機能アノテーションの紐付け続き

各生物のprotein.faaのヘッダー行を抽出して辞書を作成、それを次々に結合。

実行コードが記されたpythonファイルは以下の通り。

### fnanno.pyの中身

import pandas as pd
# merged_best_hits.txtのパスと読み込み
merged_best_hits_file = 'merged_best_hits.txt'
merged_df = pd.read_csv(merged_best_hits_file, sep='\t')

#出力ファイルのパス
output_file = 'merged_with_gene_function.csv'
output_file2 = 'merged_with_gene_function.txt'

##########################################################
###Dmel
# Dmel_protein.faaのパス
dmel_protein_file = 'Dmel_protein.faa'

# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []

# Dmel_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(dmel_protein_file, 'r') as file:
    for line in file:
        if line.startswith('>'):
            parts = line.strip().split(' ', 1)
            gene_id = "Dmel_" + parts[0][1:]  # ">"を取り除き、"Dmel_"を追加
            parts2 = parts[1].split(' [')
            gene_function = parts2[0]  # 遺伝子機能
            gene_ids.append(gene_id)
            gene_functions.append(gene_function)

# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)

# マージ
merged_df = pd.merge(
    merged_df,
    df,
    left_on='Dmelanogaster',
    right_on='GeneID',
    how='left'
)

# 不要なGeneID列を削除し、列名をDmel_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Dmel_GeneFunction'})

# 確認のため最初の数行を表示
print("Dmel")
print(merged_df)
####################################################################
###Ecol

# Ecol_protein.faaのパス
Ecol_protein_file = 'Ecol_protein.faa'

# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []

# Ecol_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(Ecol_protein_file, 'r') as file:
    for line in file:
        if line.startswith('>'):
            parts = line.strip().split(' ', 1)
            gene_id = "Ecol_" + parts[0][1:]  # ">"を取り除き、"Ecol_"を追加
            parts2 = parts[1].split(' [')
            gene_function = parts2[0]  # 遺伝子機能
            gene_ids.append(gene_id)
            gene_functions.append(gene_function)

# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)

# マージ
merged_df = pd.merge(
    merged_df,
    df,
    left_on='Ecoli',
    right_on='GeneID',
    how='left'
)

# 不要なGeneID列を削除し、列名をEcol_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Ecol_GeneFunction'})

# 確認のため最初の数行を表示
print("Ecol")
print(merged_df)
####################################################################
###Tcas

# Tcas_protein.faaのパス
Tcas_protein_file = 'Tcas_protein.faa'

# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []

# Tcas_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(Tcas_protein_file, 'r') as file:
    for line in file:
        if line.startswith('>'):
            parts = line.strip().split(' ', 1)
            gene_id = "Tcas_" + parts[0][1:]  # ">"を取り除き、"Tcas_"を追加
            parts2 = parts[1].split(' [')
            gene_function = parts2[0]  # 遺伝子機能
            gene_ids.append(gene_id)
            gene_functions.append(gene_function)

# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)

# マージ
merged_df = pd.merge(
    merged_df,
    df,
    left_on='Tcastaneum',
    right_on='GeneID',
    how='left'
)

# 不要なGeneID列を削除し、列名をTcas_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Tcas_GeneFunction'})

# 確認のため最初の数行を表示
print("Tcas")
print(merged_df)
####################################################################
###Sory

# Sory_protein.faaのパス
Sory_protein_file = 'Sory_protein.faa'

# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []

# Sory_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(Sory_protein_file, 'r') as file:
    for line in file:
        if line.startswith('>'):
            parts = line.strip().split(' ', 1)
            gene_id = "Sory_" + parts[0][1:]  # ">"を取り除き、"Sory_"を追加
            parts2 = parts[1].split(' [')
            gene_function = parts2[0]  # 遺伝子機能
            gene_ids.append(gene_id)
            gene_functions.append(gene_function)

# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)

# マージ
merged_df = pd.merge(
    merged_df,
    df,
    left_on='Soryzae',
    right_on='GeneID',
    how='left'
)

# 不要なGeneID列を削除し、列名をSory_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Sory_GeneFunction'})

# 確認のため最初の数行を表示
print("Sory")
print(merged_df)
#########################################################

merged_df_with_function = merged_df.reindex(columns=['Madara',\
 'Ecoli','Ecol_GeneFunction',\
 'Dmelanogaster', 'Dmel_GeneFunction',\
  'Tcastaneum', 'Tcas_GeneFunction',\
   'Soryzae', 'Sory_GeneFunction'])

# 新しいデータフレームをCSVファイルとして保存
merged_df_with_function.to_csv(output_file, index=False)

# 新しいデータフレームをTXTファイルとして保存
merged_df_with_function.to_csv(output_file2, sep='\t', index=False)  # タブ文字を区切り文字として使用

# 確認のため最初の数行を表示
print(merged_df_with_function)

0709

コフキゲノムのBRAKER終了

~/tools/for_braker/Kohuki/braker/ディレクトリが構築され、結果が出力された。 一方でkohuki_braker.sh.o26238954には以下のWARNING MESSAGEが出力された。

start at
Wed Jul  3 01:40:00 JST 2024
# Wed Jul  3 01:40:37 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_braker/Kohuki/braker/braker.log
#*********
# WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1413
file /lustre7/home/kosukesano/tools/for_braker/Kohuki/braker/genome.fa contains a highly fragmented assembly (2372896 scaffolds). This may lead to problems when running AUGUSTUS via braker in parallelized mode. You set --threads=16. You should run braker.pl in linear mode on such genomes, though (--threads=1).
#*********

contigが細かすぎる?

threads=1にしたコフキゲノムのBRAKER

~/tools/for_braker/Kohuki_thread_oneを作成、以下のスクリプトをジョブとして投げた。

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/kohuki_softmasked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --threads=1\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

0716

threads=1にしたコフキゲノムのBRAKER結果

変わらず最後まで出力されていない?

kosukesano@at139:~/tools/for_braker/Kohuki_thread_one$ ls
braker  kohuki_braker.sh  kohuki_braker.sh.e26250715  kohuki_braker.sh.o26250715  kohuki_braker.sh.pe26250715  kohuki_braker.sh.po26250715
kosukesano@at139:~/tools/for_braker/Kohuki_thread_one$ ls braker/
GeneMark-EP  GeneMark-ES  GeneMark-ES.stdout  braker.log  errors  gc_content.out  genome.fa  genome_header.map  proteins.fa  species  what-to-cite.txt

フェモラータゲノムのソフトマスク続き

kosukesano@at139:~/tools/for_softmask/Sfemorata_pilon_softmask$ ls RM_76722.MonJul80922572024/
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa

RepeatModelerは無事できた。続いてRepeatMaskerに移る。

### Sfem_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib ~/tools/for_softmask/Sfemorata_pilon_softmask/RM_*/consensi.fa.classified  /home/kosukesano/tools/for_softmask/nama_data/../nama_data/Sfem_pilon/assembly.pilon.fasta
date

0718

BRAKER実行結果の確認

seqkitでの確認をしてなかったので改めて確認

### マダラゲノム(RNA_seqデータ含)
kosukesano@at137:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,570  8,790,187        5    530.5   20,186
######################################################################

手動での系統樹作成

前準備として~/tools/for_orthofinder/make_philo_treeの下にManualPhylo_1.pyを作成、実行した。

###ManualPhylo_1の中身


##analysis_manual.pptxの#46も参照

##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh

##時間は10secほど

import numpy as np
import pandas as pd
import os

path = "~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/"
withpath = "../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/"

OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")

##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups_SingleCopyOrthologues.txt", "r") as fin:
    for line in fin:
        li = line.rstrip()
        new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "ManualPhylo_data/OG_list.txt", sep = " ", index = False, header = False)

##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。

li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "ManualPhylo_data/species_list.txt", "w") as file:
   for column_name in allspe2:
       file.write("%s\n" % column_name)

実行時のコマンド

kosukesano@at137:~/tools/for_orthofinder/make_philo_tree$mkdir ../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data
kosukesano@at137:python ManualPhylo_1.py

先にManualPhylo_dataディレクトリを作っておかないとうまくいかない。

続いて~/tools/for_orthofinder/make_philo_treeの下にManualPhylo_2.pyを作成、実行した。

###ManualPhylo_2の中身


##ManualPhylo_1.pyの続き

import sys
from Bio import SeqIO

path = "../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data/"

fasta_in = sys.argv[1]                                  #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2]                                  #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する

for q in open(query_in, "r"):                                           #オーソログファイルを開いて1行づつ読み込む
        query = q.split()                                                       #スペース毎に切りとってリスト形式でqueryに保存する
        f = open(path + query[0], 'w')                                  #最初の列(OG名)と同じ名前のファイルを作成する
        for record in SeqIO.parse(fasta_in, 'fasta'):   #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
                id_part = record.id                                     #fastaのID部分を読み込む
                desc_part = record.description                  #fastaのdescription部分を読み込む
                seq = record.seq                                        #fastaの配列部分を読み込む
                for i in range(len(query)):                         #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
                        if desc_part == query[i] :                  #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
                                fasta_seq = '>' + desc_part + '\n' + seq + '\n'         #fasta形式に整え
                                print(fasta_seq)                                        #標準出力にfastaを出力(進行状況把握用)
                                f.write(str(fasta_seq))                             #各OGファイルにfastaを出力
        f.close()

##できたOGファイルは、align.shやOG_list.txtと同じ場所に
##align.shのある場所までいき、作動。cwdを231016/ManualPhylo_dataにしないとtrimalが作動せず、イライラ

実行時のコマンド

kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$ python ManualPhylo_2.py output_directory/all_seq.fa ../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data/OG_list.txt 

続く解析で使用するMAFFTtrimalをインストールする。そのために新規mamba環境を作成。

~/tools/pyenv_env/ManualPhilo_profileを作成。

###ManualPhilo_profileの中身


source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
        . "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
    else
        export PATH="/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
    fi
fi
unset __conda_setup
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
    . "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<

conda activate MPT

これをすぐ実行すると、(MPT環境がまだできていないため)mambabase環境が立ち上がる。この状態でMPT環境を作成する。

(MPT) kosukesano@at138:~/tools/for_MAFFT$ mamba install -c bioconda -y mafft

                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (1.1.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: ['mafft']

bioconda/linux-64                                    5.6MB @   3.4MB/s  1.8s
bioconda/noarch                                      5.3MB @   2.9MB/s  2.0s
conda-forge/noarch                                  18.0MB @   6.6MB/s  3.2s
conda-forge/linux-64                                42.6MB @   7.5MB/s  6.7s
Transaction

  Prefix: /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/envs/MPT

  Updating specs:

   - mafft


  Package               Version  Build        Channel                    Size
───────────────────────────────────────────────────────────────────────────────
  Install:
───────────────────────────────────────────────────────────────────────────────

  + _libgcc_mutex           0.1  conda_forge  conda-forge/linux-64     Cached
  + _openmp_mutex           4.5  2_gnu        conda-forge/linux-64     Cached
  + gawk                  5.3.0  ha916aea_0   conda-forge/linux-64     Cached
  + gettext              0.22.5  h59595ed_2   conda-forge/linux-64      475kB
  + gettext-tools        0.22.5  h59595ed_2   conda-forge/linux-64        3MB
  + gmp                   6.3.0  hac33072_2   conda-forge/linux-64      460kB
  + libasprintf          0.22.5  h661eb56_2   conda-forge/linux-64       43kB
  + libasprintf-devel    0.22.5  h661eb56_2   conda-forge/linux-64       34kB
  + libgcc-ng            14.1.0  h77fa898_0   conda-forge/linux-64      842kB
  + libgettextpo         0.22.5  h59595ed_2   conda-forge/linux-64      171kB
  + libgettextpo-devel   0.22.5  h59595ed_2   conda-forge/linux-64       37kB
  + libgomp              14.1.0  h77fa898_0   conda-forge/linux-64      457kB
  + libstdcxx-ng         14.1.0  hc0a3c3a_0   conda-forge/linux-64        4MB
  + mafft                 7.525  h031d066_1   bioconda/linux-64           3MB
  + mpfr                  4.2.1  h9458935_1   conda-forge/linux-64      643kB
  + ncurses                 6.5  h59595ed_0   conda-forge/linux-64      887kB
  + readline                8.2  h8228510_1   conda-forge/linux-64     Cached

  Summary:

  Install: 17 packages

  Total download: 14MB

───────────────────────────────────────────────────────────────────────────────


libgettextpo                                       170.6kB @ 703.9kB/s  0.2s
libgomp                                            456.9kB @   1.7MB/s  0.3s
libgcc-ng                                          842.1kB @   3.0MB/s  0.3s
libasprintf-devel                                   34.2kB @ 119.1kB/s  0.1s
libstdcxx-ng                                         3.9MB @  12.2MB/s  0.3s
gettext                                            475.1kB @   1.4MB/s  0.1s
mpfr                                               643.1kB @   1.8MB/s  0.1s
ncurses                                            887.5kB @   2.4MB/s  0.1s
libasprintf                                         43.2kB @ 115.9kB/s  0.1s
libgettextpo-devel                                  36.8kB @  90.5kB/s  0.1s
gettext-tools                                        2.7MB @   6.6MB/s  0.4s
gmp                                                460.1kB @   1.0MB/s  0.1s
mafft                                                3.5MB @   7.7MB/s  0.1s
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
(MPT) kosukesano@at138:~/tools/for_MAFFT$ ls
(MPT) kosukesano@at138:~/tools/for_MAFFT$ ls -a
.  ..
(MPT) kosukesano@at138:~/tools/for_MAFFT$ mafft

0719

ASTRALの前準備

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data以下でmakealltree.shを作成した。

### makealltree.shの中身


#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"

# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data

# 出力ファイル
output_file="all_trees.nwk"

# 既存の出力ファイルを削除
if [ -f $output_file ]; then
    rm $output_file
fi

# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
    # ファイル名から拡張子を除いたベース名を取得
    base_name=$(basename $file .maffted.trimed.edit.fa)

    # Singularityを使用してIQ-TREEを実行して系統樹を作成
    singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}

    # 作成された系統樹ファイル (.treefile) を output_file に追加
    if [ -f ${base_name}.treefile ]; then
        echo -n "${base_name}: " >> $output_file
        cat ${base_name}.treefile >> $output_file
        echo "" >> $output_file
    else
        echo "Error: ${base_name}.treefile not found" >&2
    fi
done

echo "All trees have been written to $output_file"

date

0722追記。これを作業ノードで実行したら終わらずタイムアウト。

0722

ASTRAL前準備

makealltree.shqsubで投げた。

フェモラータ新規ゲノムのソフトマスク続き

RepeatMaskerはできた。続いてProcessRepeatsに移る。

### Sfem_ProcessRepeats.sh


#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

ProcessRepeats -maskSource ~/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta -xsmall -gff ~/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta.cat.gz
date

これをqsubで投げた。

0724

ASTRAL実行

makealltree.shが終わったので出力ファイルを使ってラン

makealltree.sh出力のall_trees.nwk/home/kosukesano/tools/for_ASTRAL/Astral/dataに格納した。


kosukesano@at139:~/tools/for_ASTRAL/Astral$ java -jar astral.5.7.8.jar -i data/all_trees.nwk -o 240724_result/out.tre 2> 240724_result/out.log
kosukesano@at139:~/tools/for_ASTRAL/Astral$ ls 240724_result/
out.log
kosukesano@at139:~/tools/for_ASTRAL/Astral$
### /240724_result/out.logの中身

================== ASTRAL ===================== 

This is ASTRAL version 5.7.8
Gene trees are treated as unrooted
Exception in thread "main" java.lang.RuntimeException: Failed to Parse Tree number: 1
        at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:813)
        at phylonet.coalescent.CommandLine.readOptions(CommandLine.java:321)
        at phylonet.coalescent.CommandLine.main(CommandLine.java:486)
Caused by: phylonet.tree.io.ParseException: Number expected
        at phylonet.tree.io.NewickReader.readNode(NewickReader.java:428)
        at phylonet.tree.io.NewickReader.readTree(NewickReader.java:374)
        at phylonet.tree.io.NewickReader.readTree(NewickReader.java:95)
        at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:780)
        ... 2 more

0725

ASTRALのラン終了

240724_result/out.logにアウトプットファイルが出力された。

### out.logの中身

================== ASTRAL ===================== 

This is ASTRAL version 5.7.8
Gene trees are treated as unrooted
Exception in thread "main" java.lang.RuntimeException: Failed to Parse Tree number: 1
        at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:813)
        at phylonet.coalescent.CommandLine.readOptions(CommandLine.java:321)
        at phylonet.coalescent.CommandLine.main(CommandLine.java:486)
Caused by: phylonet.tree.io.ParseException: Number expected
        at phylonet.tree.io.NewickReader.readNode(NewickReader.java:428)
        at phylonet.tree.io.NewickReader.readTree(NewickReader.java:374)
        at phylonet.tree.io.NewickReader.readTree(NewickReader.java:95)
        at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:780)
        ... 2 more

何かエラーを吐いている?

フェモラータ新規ゲノムのソフトマスク完了

RepeatMaskerの結果が返ってきた

kosukesano@at139:~/tools/for_softmask/nama_data/Sfem_pilon$ ls
Sfem_ProcessRepeats.sh            Sfem_ProcessRepeats.sh.pe26282981  assembly.pilon.fasta.cat.gz  assembly.pilon.fasta.out.gff
Sfem_ProcessRepeats.sh.e26282981  Sfem_ProcessRepeats.sh.po26282981  assembly.pilon.fasta.masked  assembly.pilon.fasta.preMonJul220243592024.RMoutput
Sfem_ProcessRepeats.sh.o26282981  assembly.pilon.fasta  

assembly.pilon.fasta.maskedが目的の産物。これを~/tools/for_braker/nama_dataSfem_pilon_softmasked.fastaとしてコピーした。

フェモラータ新規ゲノムのBRAKER

~/tools/for_brakerFemo_pilonを作成し、その下でfemo_braker.shを作成。

### femo_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Sfem_pilon_softmasked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=Sfem-1_1,femo-larva_1,femo_H1_1,femo_H3_1,femo_L1_1,femo_L3_1,femo_O1_1,femo_O3_1,femo_T1_1,femo_T3_1,Sfem-1_2,femo-larva_2,femo_H1_2,femo_H3_2,femo_L1_2,femo_L3_2,femo_O1_2,femo_O3_2,femo_T1_2,femo_T3_2,femo-female_1,femo-male_1,femo_H2_1,femo_H4_1,femo_L2_1,femo_L4_1,femo_O2_1,femo_O4_1,femo_T2_1,femo_T4_1,femo-female_2,femo-male_2,femo_H2_2,femo_H4_2,femo_L2_2,femo_L4_2,femo_O2_2,femo_O4_2,femo_T2_2,femo_T4_2 \
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Sfem_RNAseq\
        --threads=16\
        --species=Sfemorata_pilon\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

これをqsubで投げた。

0729

フェモラータ新規ゲノムのBRAKER完了

BRAKERは無事動作し、ファイルが出力された。

kosukesano@at137:~/tools/for_braker/Femo_pilon$ ls
braker          femo_braker.sh.e26283956  femo_braker.sh.o26283956  femo_braker.sh.pe26283956  femo_braker.sh.po26283956
femo_braker.sh  femo_braker.sh.e26283961  femo_braker.sh.o26283961  femo_braker.sh.pe26283961  femo_braker.sh.po26283961
kosukesano@at137:~/tools/for_braker/Femo_pilon$ cd braker/
kosukesano@at137:~/tools/for_braker/Femo_pilon/braker$ ls
Augustus  GeneMark-ETP  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  species  what-to-cite.txt
kosukesano@at137:~/tools/for_braker/Femo_pilon/braker$ 

フェモラータ新規ゲノムのBUSCO

BRAKER後の出力ファイルについて、BUSCOを用いてクオリティチェックを行う。前回のフェモラータゲノムはBRAKER3での出力ファイルのクオリティ値が低かったため、高くなっているといいな。

### femo_busco.shの中身


#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
echo start at
date


singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m protein\
        -i /home/kosukesano/tools/for_braker/Femo_pilon/braker/braker.aa\
        -o BUSCO_OUTPUT_FEMO_WITHRNA\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

date

これをqsubで投げた。

CDS配列を用いたOrthofinder

PAML用にCDS配列のみで6種のOrthofinderを行った。まず~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dirディレクトリを作成し、6種の CDS配列をコピーした。


kosukesano@at137:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir$ ls
Agra.fna  Cass.fna  Dpon.fna  Smad.fna  Sory.fna  Tcas.fna
kosukesano@at137:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir$

これをもとに下記シェルスクリプトを記述、実行した。

### Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
        -f ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir\
        -t 5\
        -a 5\
        -d

date

ASTRALの実行

all_trees.nwkのOG番号が悪さをしているのでは?そこを切り取るコードを書き、実行。

### modify.pyの中身

# 元のファイルと新しいファイルのパスを設定
input_file_path = 'all_trees.nwk'
output_file_path = 'modified_trees.nwk'

# 元のファイルを開いて処理
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        # 行を ': ' で分割し、要素が2つ以上の場合のみ処理
        parts = line.split(': ', 1)
        if len(parts) > 1:
            modified_line = parts[1]
            # 新しいファイルに書き込み
            outfile.write(modified_line)

これを以下の通り実行。

kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ python modify.py 
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
all_trees.nwk  modified_trees.nwk  modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$

出力結果は以下の通り

### modified_trees.nwkの中身

(Agra:0.0635712699,Cass:0.0805557052,(Dpon:0.1219665493,(Smad:0.0432864630,(Sory:0.0817207751,Tcas:0.1397248588)70:0.0168256619)38:0.0053674344)54:0.0170031209);
(Agra:0.1655023482,((Cass:0.2728454655,Dpon:0.4276770145)67:0.0351066692,Smad:0.1218709603)53:0.0390916641,(Sory:0.1302322053,Tcas:0.8475027507)100:0.2395842901);
(Agra:0.1498975617,(Cass:0.1228354087,(Dpon:0.1591568837,(Sory:0.1686172722,Tcas:0.6188635095)85:0.0775748177)61:0.0269975353)69:0.0263101100,Smad:0.2004686939);
(Agra:0.1046182209,(Cass:0.1173390835,(Dpon:0.1265307000,Smad:0.0652513950)81:0.0262711442)56:0.0151016525,(Sory:0.0943592513,Tcas:0.3993841186)47:0.0458834130);
(Agra:0.4439818619,((Cass:0.3270195555,Smad:0.3502005062)69:0.0801526161,Dpon:0.5246458008)67:0.0745695349,(Sory:0.4691251648,Tcas:0.5980869091)96:0.3513668760);
(Agra:0.0823664659,(((Cass:0.0603291137,Sory:0.1007744717)54:0.0286917972,Dpon:0.1413255434)38:0.0325222870,Smad:0.0574125897)42:0.0394349257,Tcas:0.2854548697);
(Agra:0.6430510957,Cass:0.4001509671,((Dpon:0.4706496916,Smad:0.4285065087)45:0.0605668934,(Sory:0.4005216201,Tcas:0.7245661234)52:0.0659584460)55:0.0425434177);
(Agra:0.1849238595,(((Cass:0.0720240778,Dpon:0.1265536083)47:0.0193627993,Smad:0.1655456501)18:0.0000020169,Sory:0.1467750516)66:0.0448440021,Tcas:0.2867528256);
(Agra:0.1120432096,(Cass:0.0451799005,Smad:0.0279681266)82:0.0126641184,(Dpon:0.0822097283,(Sory:0.1540588110,Tcas:0.2053968900)98:0.0531593726)83:0.0129938791);
(Agra:0.2307249419,(Cass:0.1422763867,Smad:0.2341206100)49:0.0515095597,(Dpon:0.3216560906,(Sory:0.3535060629,Tcas:0.8600170053)43:0.1084091415)27:0.0133982920);
(Agra:0.0816817581,(Cass:0.1560371473,Smad:0.2263294156)46:0.0450279441,(Dpon:0.1543923964,(Sory:0.1934776920,Tcas:0.5730013110)61:0.0810103641)55:0.0484005680);
(Agra:0.1290642034,(Cass:0.1201414309,Dpon:0.2097540535)34:0.0033281012,(Smad:0.1322768450,(Sory:0.2402523393,Tcas:0.2655477108)89:0.1409222511)85:0.0790612548);
(Agra:0.0774654965,(((Cass:0.0089173938,Dpon:0.0421759868)46:0.0096013194,Sory:0.0138448697)39:0.0179560775,Smad:0.0256089159)77:0.0357100139,Tcas:0.0318576291);
(Agra:0.0805531475,(Cass:0.0617472257,(Dpon:0.0430911645,Smad:0.0471918037)65:0.0138789643)98:0.0342890105,(Sory:0.0783736246,Tcas:0.1968719542)100:0.0780188258);
(Agra:0.2384458487,Cass:0.3002711919,((Dpon:0.2187183648,(Sory:0.2390630242,Tcas:0.7571742224)58:0.0576932185)41:0.0296961138,Smad:0.1436197371)61:0.0378697622);
(Agra:0.0791366226,((Cass:0.0624743725,(Dpon:0.0847100902,Sory:0.1212750817)52:0.0223211948)44:0.0168742678,Smad:0.0311081778)64:0.0315535327,Tcas:0.2833860188);

こちらのファイルを指定してASTRAL.shを投げた。

0730

PAML前準備

~/tools/for_pamlディレクトリに/6sp/data/SCOディレクトリを作成した。

Orthofinder出力のOrthogroups.txtからシングルコピーオーソログのみを抽出する。実行スクリプトは以下の通り。

### ExOG.pyの中身

# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups_SingleCopyOrthologues.txt'
output_file_path = '/home/kosukesano/tools/for_paml/6sp/data/extracted_orthogroups.txt'

# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
    for line in single_copy_file:
        single_copy_orthologues.add(line.strip())

# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
    for line in orthogroups_file:
        # 行の最初の部分を取り出してIDをチェック
        og_id = line.split(':')[0].strip()
        if og_id in single_copy_orthologues:
            output_file.write(line)

これを作業ノードで実行した。

続いて上記スクリプトの出力であるextracted_orthogroups.txtを参照に、各オーソログのprotein ID に対応するCDSをそれぞれのCDSファイルから取り出し、個別のファイルとして格納する。実行スクリプトは以下の通り。

### makefna.py

import os

# 入力ファイルと出力ディレクトリのパスを設定
extracted_orthogroups_path = '/home/kosukesano/tools/for_paml/6sp/data/extracted_orthogroups.txt'
cds_dir = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir'
output_dir = '/home/kosukesano/tools/for_paml/6sp/data/SCO'

# ファイルの拡張子を変数に
file_extension = ".fna"

# 種ごとのファイル名マッピング
species_to_file = {
    "Cass": "Cass.fna",
    "Tcas": "Tcas.fna",
    "Dpon": "Dpon.fna",
    "Sory": "Sory.fna",
    "Agra": "Agra.fna",
    "Smad": "Smad.fna"
}

# 必要な出力ディレクトリを作成
os.makedirs(output_dir, exist_ok=True)

# `extracted_orthogroups.txt`を読み込み、各オーソログに対して処理
with open(extracted_orthogroups_path, 'r') as infile:
    for line in infile:
        columns = line.strip().split()
        orthogroup_id = columns[0].replace(':', '')
        protein_ids = columns[1:]

        output_file_path = os.path.join(output_dir, f"{orthogroup_id}.fna")

        with open(output_file_path, 'w') as outfile:
            for i, protein_id in enumerate(protein_ids):
                species = list(species_to_file.keys())[i]
                cds_file_path = os.path.join(cds_dir, species_to_file[species])

                # `seqkit grep`コマンドを構築して実行
                grep_command = f"singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit grep -r -p '{protein_id}' {cds_file_path} >> {output_file_path}"
                os.system(grep_command)

これを作業ノードで実行した。2時間くらいかかった。

結果として、~/tools/for_paml/6sp/data/SCOにSCOのCDS配列ファイルが出力された。

kosukesano@at137:~/tools/for_paml/6sp/data/SCO$ ls
OG0008033.fna  OG0008221.fna  OG0008418.fna  OG0008607.fna  OG0008790.fna  OG0008975.fna  OG0009160.fna  OG0009346.fna  OG0009521.fna  OG0009707.fna  OG0009888.fna
OG0008034.fna  OG0008224.fna  OG0008419.fna  OG0008609.fna  OG0008791.fna  OG0008976.fna  OG0009161.fna  OG0009347.fna  OG0009523.fna  OG0009708.fna  OG0009889.fna
OG0008035.fna  OG0008225.fna  OG0008420.fna  OG0008610.fna  OG0008792.fna  OG0008977.fna  OG0009162.fna  OG0009349.fna  OG0009525.fna  OG0009709.fna  OG0009890.fna
OG0008036.fna  OG0008226.fna  OG0008421.fna  OG0008611.fna  OG0008794.fna  OG0008978.fna  OG0009163.fna  OG0009350.fna  OG0009526.fna  OG0009710.fna  OG0009892.fna
OG0008037.fna  OG0008227.fna  OG0008423.fna  OG0008612.fna  OG0008795.fna  OG0008979.fna  OG0009164.fna  OG0009351.fna  OG0009527.fna  OG0009712.fna  OG0009894.fna
OG0008039.fna  OG0008228.fna  OG0008425.fna
.
.
.
.
.
.
### OG0008965.fnaの中身

>lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbke
y=CDS]
ATGGCTTATTTTCATAAACTAGGTCACCGATTTTTCACAACAAAAGCAATCCAAAACTGG
AATTCCAAAAGTGAGAAATTCAATGAGAAAATAAAGGGAACCATAGTTGAAAAATGGGTA
AAATATTGGAAACTTGTTACCAAAGATTACAAGGAAGTAGGTTTATCTGTTAAACAAGAA
ATTAAAGACAAACCTCTAAAAACTATTGTGTATTTTACTGGAGCAGCTTTATTCGGTTTG
TGTTGGGAGTTAAACCCAAACTTGCAAAGTTTCAGGGCAACATATATAGCATCAGCCAAC
GATCTAAGTTTAGTACCTCTTACTCTAGCGAACCCAAATTCAGTAGAACATTTAAAACAC
ATTGAACAATGTTTCAATCGAAAATATATAAGATATACAAACCTTGGACTTTTATCATTA
ATATGGGTAGATAAATTTAGTGAAGAATGCGATTTATATGAGAGCAACTGCTCATACCTT
AAAGTTCCTTTTTATAAAATAACAGGAAGAATTTTAGATGTTGGCTTTCTAAATGTATGG
TGGATTATTTCTAGAAGAATGCTCGATTATGATATAAATTATTAG
>lcl|NC_007417.3_cds_XP_008201558.1_3086 [db_xref=GeneID:103315214] [protein=uncharacterized protein C19orf52] [protein_id=XP_008201558.1] [location=complement(11429290..11429874)] [gbkey=CDS]
ATGTTGCGATTATCGGGGTTTAATGTTTTTGCTCTTGGAGCAAAAACTATCGAAAATTAT
AAAAAAGCAAGCGAGAGGATCAATAAGAAGATTAGCGGAACTTTCGTTGAAAAAGCAGTC
ATTTATTTGAAAACTGTATGGAATGACTATACCGAAGTAGCTGTCTCTGTTAGGAGCGAC
ATTACGGAGAAACCCCTAAAAGCGGCGGGTTTCTTCACCGGTATGGGCTTCGTAATGTAC
AGTTTAACACACAATCCGGACGAACAAAGTTTCAAAGCGAAATTTATCCAGTGTTCAAAT
GAGGTTTCTTTAGTTAGCCCAAATCTTGTCAATACCGCTGCAGTTGAACACATGAAGATG
ATACAAACTTGTTACAACAGAGACTTAATAAGGTACACAAACTTGGGGCTGTTTTCACTC
GTTTGGGTCGATAAATACAGTGATCAGTGCAACATGTACGAAACAAACTGTTCTTATTTG
CAACTGCCATACAGGAAATTCCCTAGTCACGTCATAGATGTAGGTTTTTTGAATATTTGG
TGGGTCATATCGCGCAAAATGTTAGATTATGACATAAATTATTAA
>lcl|NW_026017110.1_cds_XP_019769948.1_2611 [gene=LOC109544293] [db_xref=GeneID:109544293] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_019769948.1] [location=complement(join(25238952..25239267,25239322..25239623))] [gbkey=CDS]
ATGTATTTGAAAAATATTGTAAATCAGTCGAAAATGATGAAAATAAATGAACCCCAGTCC
AGATTTTTCACCACGCGTGCGTTGGAAAACTGGAAAACCACCAGTGAAAAGTTTAACGAA
AAAATTAAAGGGACGATTCTAGAAAAATGGGTGAAATATTGGAAAGTTGTGGCCAAAGAC
TACCAAGACGTCGCACTTAATGTGAAACAGGAAATTAAGCAAAAACCTTTAAAATCAACT
GTGTTTTTCACCGGGTCTGCTTTTTTAGGGCTATGCCTGCATCTAAATCCTGATCTAAAA
AGTTTTAGGTCGAAGTACATCGAATCAGCCAACAATTTAAGTTTAGTGCCACTGACGCTG
GCAAATCCAAGATCCGTAGAACATTTAAAGCACATCGAAAGATGTTTCAATCGTAAATTC
ATTCGCTATCAAAACCTGGGATTATTTTCAATTATGTGGGTAGACAAACGTAGTAAGGAG
TGCGATTCATATGAAAGCAACTGTTCATATTTAAAGGTTCCATTTTGGAATGTTAGCAGC
CGAATTTTAGACGTAGGCTTTTTGAATGTATGGTGGATTATTTCAAGGCAGATGCTAGAT
TATGATATTAATTATTAA
>lcl|NW_022146411.1_cds_XP_030754725.1_11873 [gene=LOC115881405] [db_xref=GeneID:115881405] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_030754725.1] [location=complement(join(1076167..1076482,1076551..1076813))] [gbkey=CDS]
ATGGTCCAATATAATCAGTCACTGTCACTAATATCTCGAACACTAGTCAATATAAAATCC
ACAAATCTTAAGTTTAATGAGAAAATAAAAGGTACCATAGTAGAAAAATGGGTGGCTTAT
TGGAAACTAGTGGCAAAAGATTATAAAGATGTTGGAAGATCATTAAAACAAGATATAAAA
ACAAAACCATTGCGATCTGGTTTATATTTTACAGGTGCAAGTTTGCTAGGACTTTGTGCA
TCTTTAAACCCCGATATGCAAAGTTTTAGAGCAAAATATATTCAATCTGCAAATGATTTA
GGGTTAGTTCCTACTACACTAGCTAACCCTCAAGCCTTAAATCATTTAAAATATATTGAG
AGAAGCTTTAATCACAACCTTATTCGTTACATAAATTTAGGTGTTTTATCGATAATCTGG
GTGGACAAATTTAGTGAAGATTGTAATTTATATGAAAATACTTGTTCTTATCTTCAAGTC
CCATTTTGGGAAATTAGAAAGAGAATGCTTGATATAGGATTTTTAAATGTATGGTGGATA
ACATCCAGAAAAATGCTTGACTATGATATAAATTATTAA
>lcl|NC_065546.1_cds_XP_050302974.1_1493 [gene=LOC126740812] [db_xref=GeneID:126740812] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_050302974.1] [location=join(50475192..50475457,50483767..50484082)] [gbkey=CDS]
ATGTTGAAGGTATGCAAGAGATTTTACAGTAGTCCAGTAACAGGGAATTCGAACTGGCAG
ACTGTGAGCCATAAGTTTAATCAAAAAATCAAAGGCACTTTCTTGGAAAAATGGGCGAAG
TTTTGGAAAACTGTCGCCAAAGACTATAAAGAAGTTGCCATAAATGTGAAACAGGATATA
AAACAGAAACCATTAAAGGCTGCCGCATACTTCAGTGCATCTGCCTTTGTTGGATTGTGC
ATTCAATTCAACCCAGATTTGCAAAGTTTCAGATCAAAATATGTCCAATCAGCAAATGAA
GTAGGTTTGGTACCTCTTAGCCTAACAAATCCACAAGCTGTAGAGCATTTAAATTACATT
GAAAGGTGTTTTAACCAACAGTTAATTAGGTATGTCAACCTAGGAATATTTTCAATAATA
TGGGTGGATAAATTCAGTAAAGAGTGTGACACCTATGAAAGTAAATGCACATACTTGCAA
GTTCCTTACTGGGGTATACCCAGCAGAATATTAGATATAGGATTTTTAAATGTATGGTGG
ATTACATCTAGAAAAATGTTGGATTATGACATAAATTATTAG
>g3079.t1
ATGTATTCACTGAACAAAATAAGTAGGAGGTTACTCACCACCCGAGCACTGGAAAACTTA
AAATCCACAAATGAAAAATTGAACAATAAAATAAAGGGAACATTCATTGAAAAATGGGTA
AAATATTGGAAACTTATAGCTAAAGACTATCAGGATGTCAGCATTTCAGTTAAACAAGAT
ATTAAAGCAAAACCACTGAGGACAATGGCATATTTTACAGGAGCTGCATTCATAGGTTTA
TGCATTGAATTAAATCCAGATCTGCAAAGTTTTAGGGCAAAATACATTGCATCTGCCAAT
GACCTCAGTTTAGTACCTTTACATTTAGCAAATTCACAAGCTGTTGAGCACTTGAAGTAC
GTGGAACGCTGCTTTAATCGCAAATTTATCAGATATATGAATCTTGGAATTGCATCAGTA
GTATGGGTGGATAAATATAGTAGCGAGTGTGACACCTACGAGAGCAACTGTTCTTATTTA
CAAGTACCTTATTGGAATATAACAGACAGAATATTGGACATAGGCTTCCTAAATGTATGG
TGGATTATTTCCAGAAAAATGTTAGATTATGatataaattactag

こんな感じ!

続いてこれらのファイルをMAFFTによりアライメントする。実行したシェルスクリプトは以下の通り。

### mafft.sh

#$ -S /bin/bash

source ~/tools/pyenv_env/ManualPhilo_profile

# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO/"
output_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fna)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_maffted.fna"

  # MAFFTを実行
  mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"

  echo "Aligned file created: $output_file"
done

これをqsubで投げた。

PAMLのテストラン

~/tools/for_paml/testにて以下のコードを書き、PAMLを実行してみた。

### bsA.ctl

      seqfile = /home/kosukesano/tools/for_paml/6sp/data/SCO/OG0008965_maffted.fna
     treefile = /home/kosukesano/tools/for_paml/test/data/tree_ultrametric.nwk
      outfile = result/OG0008965_branch_alt

        noisy = 9   * 0,1,2,3,9: how much rubbish on the screen
      verbose = 1   * 1: detailed output, 0: concise output
      runmode = 0   * 0: user tree;  1: semi-automatic;  2: automatic
                    * 3: StepwiseAddition; (4,5):PerturbationNNI

      seqtype = 2   * 1:codons; 2:AAs; 3:codons-->AAs
    CodonFreq = 2   * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table
        clock = 0   * 0: no clock, unrooted tree, 1: clock, rooted tree

     model = 2           * 記号の有無で異なる ω を推定
   NSsites = 0           * サイト間では ω は一定
 fix_omega = 0           * ω の値を配列から推定
     omega = 1           * 推定は ω=1 からスタート



        icode = 0   * 0:standard genetic code; 1:mammalian mt; 2-10:see below

    fix_kappa = 0   * 1: kappa fixed, 0: kappa to be estimated
        kappa = 2   * initial or fixed kappa

    fix_alpha = 1   * 0: estimate gamma shape parameter; 1: fix it at alpha
        alpha = .0  * initial or fixed alpha, 0:infinity (constant rate)
       Malpha = 0   * different alphas for genes
        ncatG = 4   * # of categories in the dG or AdG models of rates

        getSE = 0   * 0: don't want them, 1: want S.E.s of estimates
 RateAncestor = 0   * (1/0): rates (alpha>0) or ancestral states (alpha=0)
       method = 0   * 0: simultaneous; 1: one branch at a time
  fix_blength = 0  * 0: ignore, -1: random, 1: initial, 2: fixed, 3: proportional


* Specifications for duplicating results for the small data set in table 1
* of Yang (1998 MBE 15:568-573).
* see the tree file lysozyme.trees for specification of node (branch) labels

これを以下のように実行。


kosukesano@at137:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml bsA.ctl 

 15         verbose | verbose                1.00
  7         runmode | runmode                0.00
  4         seqtype | seqtype                2.00
 13       CodonFreq | CodonFreq              2.00
  9           clock | clock                  0.00
 16           model | model                  2.00
 20         NSsites | NSsites                0.00
 26       fix_omega | fix_omega              0.00
 27           omega | omega                  1.00
 22           icode | icode                  0.00
 24       fix_kappa | fix_kappa              0.00
 25           kappa | kappa                  2.00
 28       fix_alpha | fix_alpha              1.00
 29           alpha | alpha                  0.00
 30          Malpha | Malpha                 0.00
 31           ncatG | ncatG                  4.00
 11           getSE | getSE                  0.00
 12    RateAncestor | RateAncestor           0.00
  8          method | method                 0.00
 37     fix_blength | fix_blength            0.00
AAML in paml version 4.9, March 2015

processing fasta file
reading seq# 1 lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbkey=CDS]     624 sites
reading seq# 2 lcl|NC_007417.3_cds_XP_008201558.1_3086 [db_xref=GeneID:103315214] [protein=uncharacterized protein C19orf52] [protein_id=XP_008201558.1] [location=complement(11429290..11429874)] [gbkey=CDS]     624 sites
reading seq# 3 lcl|NW_026017110.1_cds_XP_019769948.1_2611 [gene=LOC109544293] [db_xref=GeneID:109544293] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_019769948.1] [location=complement(join(25238952..25239267,25239322..25239623)     624 sites
reading seq# 4 lcl|NW_022146411.1_cds_XP_030754725.1_11873 [gene=LOC115881405] [db_xref=GeneID:115881405] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_030754725.1] [location=complement(join(1076167..1076482,1076551..1076813))]      624 sites
reading seq# 5 lcl|NC_065546.1_cds_XP_050302974.1_1493 [gene=LOC126740812] [db_xref=GeneID:126740812] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_050302974.1] [location=join(50475192..50475457,50483767..50484082)] [gbkey=CDS]     624 sites
reading seq# 6 g3079.t1                                               624 sites
ns = 6          ls = 624
Reading sequences, sequential format..
Reading seq # 1: lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CE     

Error in sequence data file: U at 1 seq 1.
Make sure to separate the sequence from its name by 2 or more spaces.
kosukesano@at137:~/tools/for_paml/test$ ls result/
OG0008101_branch_alt  OG0008101_bs_alt  OG0008768_branch_alt  OG0008965_branch_alt

シーケンスデータファイルの形式に問題がある?

2024年8月

0804

CDSを用いたOrthofinderの結果

ファイルの拡張子が合わない?

標準出力ファイルは以下のようになった。

### Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26291666の中身

start at
Sun Aug  4 00:59:20 JST 2024

OrthoFinder version 2.5.4 Copyright (C) 2014 David Emms

2024-08-04 00:59:25 : Starting OrthoFinder 2.5.4
5 thread(s) for highly parallel tasks (BLAST searches etc.)
5 thread(s) for OrthoFinder algorithm

Checking required programs are installed
----------------------------------------
Test can run "mcl -h" - ok
Test can run "fastme -i /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/OrthoFinder/Results_Aug04/WorkingDirectory/SimpleTest.phy -o /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/OrthoFinder/Results_Aug04/WorkingDirectory/SimpleTest.tre" - ok

WARNING: Files have been ignored as they don't appear to be FASTA files:
Agra.fna
Cass.fna
Dpon.fna
Smad.fna
Sory.fna
Tcas.fna
OrthoFinder expects FASTA files to have one of the following extensions: pep, faa, fa, fas, fasta
ERROR: At least two species are required
ERROR: An error occurred, ***please review the error messages*** they may contain useful information about the problem.
Sun Aug  4 00:59:47 JST 2024
(END)

入力ファイルを.fastaに変更。もう一度qsubで投げた。

CAFEの結果とDEG解析の結果を照合する

以下のコードをrで実行した。

### home/bio/for_cafe/caferesult.Rの中身



library(tidyverse)
Deg<-read.csv("Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
og<-read.csv("Original_data/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.tsv", sep=":", skip=1)
Plami<-read.csv("old_result/Base_change.tab", sep="\t")

View(Plami)

# ファイルを読み込む
file_path <- "old_result/Base_asr.tre"
lines <- readLines(file_path)

print(lines)
# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", lines))
trees_end <- which(grepl("END;", lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]

# 不要なスペースを削除
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)

# データフレームに変換
library(tibble)
trees_df <- tibble(Tree = trees_lines)

ex=trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
#lines|> 
  tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>
  dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>
  dplyr::mutate(tree = stringr::str_extract(tree, r"(Smad<0>_)")) |>
  dplyr::mutate(tree = tidyr::replace_na(tree, "significant")) |>
  dplyr::filter(tree == "significant") |>
  print()

View(ex)
#################################################################

Plami2=Plami |>###マダラで増加した0Gの0G番号を抽出したファイル
  dplyr::select("FamilyID","Smad.0.") |>
  dplyr::mutate(Smad.0. = stringr::str_extract(Smad.0., r"(^\d+)")) |>
  tidyr::drop_na()|>
  dplyr::filter(Smad.0. != 0) |>
  print()
View(Plami2)

#################################################################

df=dplyr::inner_join(Plami2, ex, by = c(FamilyID = "OG_num"))|>###マダラで優位に増加したOGのOG番号を抽出したファイル
  print()

##################################################################
# ファイルパスの指定
orthogroups_file <- "Original_data/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.tsv"

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

# データの最初の数行を表示して確認
head(orthogroups)
View(orthogroups)
################################################################
df2=dplyr::left_join(df, orthogroups, by = c(FamilyID = "V1"))|>
  dplyr::select(!c(Smad.0., tree)) |>
  print()
View(df2)

################################################################

# V5列の遺伝子IDをカンマで区切って、新しいデータフレームを作成

df_expanded <- df2 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
  separate_rows(V5, sep = ", ") %>%
  rename(gene_ID = V5, family_ID = FamilyID)|>
  print()
###############################################################
### CAFE5でマダラでのみ増加した遺伝子とその機能のファイル、df3

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
View(fa)

df3=dplyr::left_join(df_expanded, fa, by = c(gene_ID = "Madara"))|>###完成系
  print()

write.csv(df3, "CAFE_plus_gene.csv", row.names = FALSE)
##############################################################
###DEG解析との結合

deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")|>
  print()
View(deg1)

deg1_merge=dplyr::inner_join(df3, deg1, by = "gene_ID")|>
  dplyr::select(!c(family_ID)) |>
  print()
View(deg1_merge)
write.csv(deg1_merge, "DEG_CAFE_ovary_vs_body.csv", row.names = FALSE)


deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")|>
  print()
View(deg2)

deg2_merge=dplyr::inner_join(df3, deg2, by = "gene_ID")|>
  dplyr::select(!c(family_ID)) |>
  print()
View(deg2_merge)
write.csv(deg2_merge, "DEG_CAFE_adult_vs_larva.csv", row.names = FALSE)

#################################################################


print(c(deg1_merge$Dmelanogaster, deg1_merge$Dmel_GeneFunction))

print(deg2_merge$Dmel_GeneFunction)


sng=df3|>
  dplyr::mutate(Dmelanogaster = stringr::str_replace(Dmelanogaster, "Dmel_", "")) |>
  dplyr::select(Dmelanogaster) |>
  print(n = 488)

0805

PAMLのテスト続き

前回の反省をもとにヘッダー最後に空白を入れた上でPAML実行。以下はヘッダーに空白を加えるスクリプト。

###


import os
import re

# 処理するディレクトリ
directory = '/home/kosukesano/tools/for_paml/6sp/data/SCO'

# 正規表現パターン
pattern = re.compile(r'^>.*')

# ディレクトリ内のファイルを処理
for filename in os.listdir(directory):
    if re.match(r'^OG\d+_maffted\.fna$', filename):
        filepath = os.path.join(directory, filename)
        
        with open(filepath, 'r') as file:
            lines = file.readlines()
        
        new_lines = []
        for line in lines:
            if pattern.match(line):
                # シーケンス名行には2つ以上のスペースを追加
                new_lines.append(re.sub(r'(>.*)', r'\1  ', line.strip()))
            else:
                # シーケンス行には変更を加えない
                new_lines.append(line.strip())
        
        # 整形されたファイルを保存
        with open(filepath, 'w') as file:
            for line in new_lines:
                file.write(line + '\n')

print("All files have been processed.")

これを実行した上で、PAMLを再実行。

kosukesano@at137:~/tools/for_paml/test$ python seq_space_plus.py 
All files have been processed.
kosukesano@at137:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml bsA.ctl 

 15         verbose | verbose                1.00
  7         runmode | runmode                0.00
  4         seqtype | seqtype                2.00
 13       CodonFreq | CodonFreq              2.00
  9           clock | clock                  0.00
 16           model | model                  2.00
 20         NSsites | NSsites                0.00
 26       fix_omega | fix_omega              0.00
 27           omega | omega                  1.00
 22           icode | icode                  0.00
 24       fix_kappa | fix_kappa              0.00
 25           kappa | kappa                  2.00
 28       fix_alpha | fix_alpha              1.00
 29           alpha | alpha                  0.00
 30          Malpha | Malpha                 0.00
 31           ncatG | ncatG                  4.00
 11           getSE | getSE                  0.00
 12    RateAncestor | RateAncestor           0.00
  8          method | method                 0.00
 37     fix_blength | fix_blength            0.00
AAML in paml version 4.9, March 2015

processing fasta file
reading seq# 1 lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbkey=CDS]       624 sites
reading seq# 2 lcl|NC_007417.3_cds_XP_008201558.1_3086 [db_xref=GeneID:103315214] [protein=uncharacterized protein C19orf52] [protein_id=XP_008201558.1] [location=complement(11429290..11429874)] [gbkey=CDS]       624 sites
reading seq# 3 lcl|NW_026017110.1_cds_XP_019769948.1_2611 [gene=LOC109544293] [db_xref=GeneID:109544293] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_019769948.1] [location=complement(join(25238952..25239267,25239322..25239623)       624 sites
reading seq# 4 lcl|NW_022146411.1_cds_XP_030754725.1_11873 [gene=LOC115881405] [db_xref=GeneID:115881405] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_030754725.1] [location=complement(join(1076167..1076482,1076551..1076813))]       624 sites
reading seq# 5 lcl|NC_065546.1_cds_XP_050302974.1_1493 [gene=LOC126740812] [db_xref=GeneID:126740812] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_050302974.1] [location=join(50475192..50475457,50483767..50484082)] [gbkey=CDS]       624 sites
reading seq# 6 g3079.t1                                               624 sites
ns = 6          ls = 624
Reading sequences, sequential format..
^C
kosukesano@at137:~/tools/for_paml/test$

途中で止まって動かない……。

種名がないのが原因なのでは? CDSを取ってくる際にヘッダーに元ファイルの種名をくっつけるように変更。

###/home/kosukesano/tools/for_paml/6sp/data/makefna_plusname.py の中身

import os

# 入力ファイルと出力ディレクトリのパスを設定
extracted_orthogroups_path = '/home/kosukesano/tools/for_paml/6sp/data/extracted_orthogroups.txt'
cds_dir = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir'
output_dir = '/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname'

# ファイルの拡張子を変数に
file_extension = ".fasta"

# 種ごとのファイル名マッピング
species_to_file = {
    "Cass": "Cass.fasta",
    "Tcas": "Tcas.fasta",
    "Dpon": "Dpon.fasta",
    "Sory": "Sory.fasta",
    "Agra": "Agra.fasta",
    "Smad": "Smad.fasta"
}

# 必要な出力ディレクトリを作成
os.makedirs(output_dir, exist_ok=True)

# `extracted_orthogroups.txt`を読み込み、各オーソログに対して処理
with open(extracted_orthogroups_path, 'r') as infile:
    for line in infile:
        columns = line.strip().split()
        orthogroup_id = columns[0].replace(':', '')
        protein_ids = columns[1:]

        output_file_path = os.path.join(output_dir, f"{orthogroup_id}{file_extension}")

        with open(output_file_path, 'w') as outfile:
            for i, protein_id in enumerate(protein_ids):
                species = list(species_to_file.keys())[i]
                cds_file_path = os.path.join(cds_dir, species_to_file[species])

                # `seqkit grep`コマンドを構築して実行
                grep_command = f"singularity exec -e /usr/local/biotools/s/seqkit:2.5.0--h9ee0642_0 seqkit grep -r -p '{protein_id}' {cds_file_path}"
                result = os.popen(grep_command).read()
                
                # ヘッダーに種名を追加
                result = result.replace('>', f'>{species}|', 1)
                
                # 出力ファイルに書き込み
                outfile.write(result)

print("All files have been processed.")

試しにこれで出力されたOG0008033.fastaMAFFTでアライメントしてテストプレイ。MAFFTのスクリプトは以下の通り。

###/home/kosukesano/tools/for_paml/6sp/data/mafft_240805test.shの中身

#$ -S /bin/bash

source ~/tools/pyenv_env/ManualPhilo_profile

# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname/"
output_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"OG0008033.fasta; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fasta)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_maffted.fasta"

  # MAFFTを実行
  mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"

  echo "Aligned file created: $output_file"
done

これでできたファイルは以下の通り。

>Cass|lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbke
y=CDS]  
---------------------------------atggcttattttcataaactaggtcac
cgatttttca------caacaaaagcaatccaaaactggaattccaaaagtgagaaattc
aatgagaaaataaagggaaccatagttgaaaaatgggtaaaatattggaaacttgttacc
aaagattacaaggaagtaggtttatctgttaaacaagaaattaaagacaaacctctaaaa
actattgtgtattttactggagcagctttattcggtttgtgttgggagttaaacccaaac
ttgcaaagtttcagggcaacatatatagcatcagccaacgatctaagtttagtacctctt
actctagcgaacccaaattcagtagaacatttaaaacacattgaacaatgtttcaatcga
aaatatataagatatacaaaccttggacttttatcattaatatgggtagataaatttagt
gaagaatgcgatttatatgagagcaactgctcataccttaaagttcctttttataaaata
acaggaagaattttagatgttggctttctaaatgtatggtggattatttctagaagaatg
ctcgattatgatataaattattag
.
.
.

これでPAMLを実行。実行の様子は以下の通り

kosukesano@at138:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml 240805_bsA.ctl 

 15         verbose | verbose                1.00
  7         runmode | runmode                0.00
  4         seqtype | seqtype                2.00
 13       CodonFreq | CodonFreq              2.00
  9           clock | clock                  0.00
 16           model | model                  2.00
 20         NSsites | NSsites                0.00
 26       fix_omega | fix_omega              0.00
 27           omega | omega                  1.00
 22           icode | icode                  0.00
 24       fix_kappa | fix_kappa              0.00
 25           kappa | kappa                  2.00
 28       fix_alpha | fix_alpha              1.00
 29           alpha | alpha                  0.00
 30          Malpha | Malpha                 0.00
 31           ncatG | ncatG                  4.00
 11           getSE | getSE                  0.00
 12    RateAncestor | RateAncestor           0.00
  8          method | method                 0.00
 37     fix_blength | fix_blength            0.00
AAML in paml version 4.9, March 2015

processing fasta file
reading seq# 1 Cass|lcl|OU892280.1_cds_CAG9767834.1_5986 [locus_tag=CEUTPL_LOCUS8389] [protein_id=CAG9767834.1] [location=complement(20400831..20402546)] [gbkey=CDS]      1734 sites
reading seq# 2 Tcas|lcl|NC_007418.3_cds_XP_001812254.1_4585 [db_xref=GeneID:656905] [protein=SRSF protein kinase 1] [protein_id=XP_001812254.1] [location=complement(join(4772053..4772586,4775214..4776362))] [gbkey=CDS]      1734 sites
reading seq# 3 Dpon|lcl|NW_026018611.1_cds_XP_019769583.1_13918 [gene=LOC109544031] [db_xref=GeneID:109544031] [protein=SRSF protein kinase 2] [protein_id=XP_019769583.1] [location=complement(5272139..5273848)] [gbkey=CDS]      1734 sites
reading seq# 4 Sory|lcl|NW_022146996.1_cds_XP_030760502.1_17223 [gene=LOC115885665] [db_xref=GeneID:115885665] [protein=SRSF protein kinase 1] [protein_id=XP_030760502.1] [location=complement(5794500..5796164)] [gbkey=CDS]      1734 sites
reading seq# 5 Agra|lcl|NC_065547.1_cds_XP_050292688.1_2936 [gene=LOC126750744] [db_xref=GeneID:126750744] [protein=SRSF protein kinase 2] [protein_id=XP_050292688.1] [location=complement(join(29075850..29076844,29099051..29099762))] [gbkey=CDS]      1734 sites
reading seq# 6 Smad|g5339.t1                                         1734 sites
ns = 6          ls = 1734
Reading sequences, sequential format..
Reading seq # 1: Cass|lcl|OU892280.1_cds_CAG9767834.1_5986 [locus_t     

Error in sequence data file: U at 5 seq 1.
Make sure to separate the sequence from its name by 2 or more spaces.
kosukesano@at138:~/tools/for_paml/test$ 

空白入れたはずなのにエラー?

まっちゃん先輩の残したデータではヘッダー行に種名以外何もなかったな…。手動で種名以下を削除し、タブを追加してやってみる。

>Cass   
atgagctcgaaagtggacgtaaatcgacgtattctggctatccaggctaagaagaaacgc
cataagcccaacaagaagaaaggcaagaacgataatatgaatggacatggggagaatcgg
atccgt---tcgaaaaacgagccttcccacagttccagcaatgagactatcgaggacccg
gatacaccgtatacaagtgatgaagaagaacaagaggacagcaccgattatcagaaggga
ggataccaccccgtcaagattggcgacctctttcttggaaggtatcatgtcactagaaaa
ttaggttggggtcatttttccactgtttggctttgctgggatctcgaagacaaacgattt
gtagctttaaaaattgtaaaatcagctaaacatttcactgaaactgctttggatgaaatc
aaaatcctcagatcagtccgcgactctgatccacaagaccccaaaaggaacaaaacagtc
caacttctgaatgatttcaaaataagtggggttaatggggtgcatgtgtgcatggtcttc
gaagttcttggtcatcatttattaaaacttataataaaatccaattaccgaggcatccca
ttggccaatgttcgtactataatgcgacaagttttagaaggtctagattatttacattca
aagtgcaaaataatccatacagacataaaaccagaaaatgtacttgtatgtgtctctgaa
gaatatattagacggcttgcttgtgaagcagccgaaatgcaccaattaggagttaaacta
ccaacttctcttataagcactgcacctccacaagaagcacctccccaaaaaatgagcaaa
aataaaaagaaaaaactcaaaaaaaaggctaagaggcaaaatgaacttctcaaaaaacaa
atggaacaaattatcgagattgaagaaaagaagaaagttagcaaagaaaatggtgatgtt
aatgatgatgttaatgatgatgatatagagtgtaataattgtacaaatgatgaagaagtc
gctaa---tgataaaattattaatggt---gtagatgagattggtggtggagaaaat---
atcccttgtgatgaacc---gtctattgctgaccctgttgtgataatgtctgaagatgac
tctccttctctaacttcaaaaagtgaaagtaaaatggaattagatccagcctttgttgaa
tgtgattttgaggtcaaaattgctgacctcggaaatgcttgttgggtcgacaaacatttc
acagaagacatccaaacaagacaatacagatctttggaagttctacttggtgctggctat
aatacttcagccgatatttggagcactgcttgcatggcctttgaattagccactggagac
tatttatttgaaccacattctggagaagattattgcagagatgaagaccatttagcccat
atcattgagttattgggaaacattccgaaaagaattgcccaaagtggaacaaattctaaa
ttatttttcaacaagaaaaatgaacttcgccatattacagggttgaaaccatggggtctt
gaagatgtgttgcaggaaaaatatgagtggccgcccaaaaatgcccgcgaatttgcaggc
ttcctgaaaccaatgttggactttgatccggacaaaagggccactgcagcagaatgtctg
aagcatccatggttgaacaataat---gaaccctcgctctctgtaggtgactga
>Tcas   
atgagcgcaaaattggacgtaaatagacgtgtcttagctatccaagctaaaaagaaacga
cataagccagctaagaaaaaaggtaagaacg---aaatgaacggccacggggaaaaccgg
atcaat---tcgaagaccgagccctcgcacagctccagcaatgagacgatcgaagaccag
gacgacccgtacacgagcgaggaggaggagcaagaagacagcaatgactaccggaaaggg
ggctaccatcctgtcaaaatcggggacctgttcctcaaccgctatcacgtcacgcggaag
ctgggctggggccacttctccaccgtgtggctgtgctgggacctgcaggaccggcggttc
gtggccctgaagatcgtcaaatcggccgaacacttcaccgaaacggcgcttgacgaaatc
aaaattttaaaagcggtgcgggagtccgaccccacggaccccaaacgcaacaagactgtc
cagttgttgaacgacttcaagatcagcggaatcaacggcgtgcacgtgtgcatggtcttt
gaagtgcttggccaccacctgttaaagctaattatcaaatcgaactaccgagggatccct
ctggacaacgtccgcacaatcatgcggcaggttctggaaggtctcgactatttgcatacg
aaatgtaaaataatccacaccgatatcaagcccgaaaacgtcctgatttgtgttagtgaa
gagtatatcaggaggctggcgtgcgaggcggcggaaatgcaccatctaggcttaaaatta
cccacgtctcttataagcaccgcaccggtccaggaagtacaagcgtcgaaaatgagcaaa
aacaagaagaagaagctgaagaagaaggcgaaacgacttaatgagctacttaaacggcag
atggagcaaatcatagagattgaggagcagaagaaggt---gaaggaaaacggcgatgtg
gcgactgataacgactgcaatggaactagt---ccgagtc---ccgagacgacgcccgag
ggccccgaagacaaactctccaacggttgccttgacgaactcgccgggggcgag-

これを使ってPAMLを実行。実行結果は以下の通り。

kosukesano@at138:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml 240805_bsA.ctl 

 15         verbose | verbose                1.00
  7         runmode | runmode                0.00
  4         seqtype | seqtype                2.00
 13       CodonFreq | CodonFreq              2.00
  9           clock | clock                  0.00
 16           model | model                  2.00
 20         NSsites | NSsites                0.00
 26       fix_omega | fix_omega              0.00
 27           omega | omega                  1.00
 22           icode | icode                  0.00
 24       fix_kappa | fix_kappa              0.00
 25           kappa | kappa                  2.00
 28       fix_alpha | fix_alpha              1.00
 29           alpha | alpha                  0.00
 30          Malpha | Malpha                 0.00
 31           ncatG | ncatG                  4.00
 11           getSE | getSE                  0.00
 12    RateAncestor | RateAncestor           0.00
  8          method | method                 0.00
 37     fix_blength | fix_blength            0.00
AAML in paml version 4.9, March 2015

processing fasta file
reading seq# 1 Cass                                                      1734 sites
reading seq# 2 Tcas                                                      1734 sites
reading seq# 3 Dpon                                                      1734 sites
reading seq# 4 Sory                                                      1734 sites
reading seq# 5 Agra                                                      1734 sites
reading seq# 6 Smad                                                      1734 sites
ns = 6          ls = 1734
Reading sequences, sequential format..
Reading seq # 1: Cass     
Reading seq # 2: Tcas     
Reading seq # 3: Dpon     
Reading seq # 4: Sory     
Reading seq # 5: Agra     
Reading seq # 6: Smad     

Sequences read..
Counting site patterns..  0:00
         477 patterns at     1734 /     1734 sites (100.0%),  0:00
Counting frequencies..

      120 bytes for distance
   152640 bytes for conP
        0 bytes for fhK
  5000000 bytes for space

Species 97?
kosukesano@at138:~/tools/for_paml/test$ less result/OG0008033_branch_alt_240805 
kosukesano@at138:~/tools/for_paml/test$ ls
240805_bsA.ctl  2NG.dN  2NG.dS  2NG.t  bsA.ctl  bsAtest.sh  bsAtest.sh.e26312072  bsAtest.sh.o26312072  data  lnf  result  rst  rst1  rub  seq_space_plus.py
kosukesano@at138:~/tools/for_paml/test$

結果のファイルは以下の通り

###~/tools/for_paml/test/result/OG0008033_branch_alt_240805の中身

Homogeneity statistic: X2 = 0.07874 G = 0.07932 

Average                        0.32401 0.00000 0.00000 0.00000 0.20852 0.00000 0.00000 0.24080 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.22667 0.00000 0.00000 0.00000
(Ambiguity characters are used to calculate freqs.)


# constant sites:    917 (52.88%)
AA distances (raw proportions of different sites)

Cass           
Tcas             0.3224
Dpon             0.2463  0.2912
Sory             0.2520  0.2993  0.2566
Agra             0.2607  0.2895  0.2641  0.2785
Smad             0.2163  0.2780  0.2174  0.2347  0.2480

できてる!

CDSを取り直したので、MAFFTをかける。また、ついでにヘッダーの処理も行う。使用したスクリプトは以下の通り。

###~/tools/for_paml/6sp/data/mafft_plusname.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu


source /home/kosukesano/tools/pyenv_env/ManualPhilo_profile

# 入力ディレクトリと出力ディレクトリのパスを設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname/"
output_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fasta; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fasta)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_maffted.fasta"

  # MAFFTを実行
  mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"

  echo "Aligned file created: $output_file"

  # ヘッダーを整形
  awk '/^>/ {split($0, a, "|"); print ">" a[1] "\t"} !/^>/ {print}' "$output_file" > "${output_dir}${base_name}_maffted_fixed.fasta"

  echo "Fixed headers for file: ${output_dir}${base_name}_maffted_fixed.fasta"
done
~

PAMLの再帰的な実行

まずブランチサイトモデルについて行う。~/tools/for_paml/6sp/bsAディレクトリを作成、その中で以下のスクリプトを用意した。

  • run_paml.sh
  • template.ctl
### run_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname"
bsA_dir="/home/kosukesano/tools/for_paml/6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fasta; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fasta)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### template.ctlの中身

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/test/data/Orthofinder_tree_convert.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

また、/home/kosukesano/tools/for_paml/test/data/ディレクトリにOrthofinder_tree_convert.nwkを作成した。

### Orthofinder_tree_convert.nwkの中身

(Tcas:0.177097,(Sory:0.19234,((Dpon:0.181257,(Cass:0.179451,Smad#1:0.145856):0.0217713):0.0202496,Agra:0.176422):0.0384627):0.177097);

これは元々Orthofinder出力のSpecies_Tree/SpeciesTree_rooted.txt

### SpeciesTree_rooted.txtの中身

(Tcas:0.177097,(Sory:0.19234,((Dpon:0.181257,(Cass:0.179451,Smad:0.145856)0.253106:0.0217713)0.232305:0.0202496,Agra:0.176422)0.60296:0.0384627)1:0.177097);

何か変な数字(ブートストラップ値?)がついていたので、手動で除外したのがSpecies_Tree/SpeciesTree_rooted.txt

PAMLの実行自体はrun_paml.shqsubで投げることで行った。

続いてブランチサイトの帰無仮説の方。~/tools/for_paml/6sp/bs_nullディレクトリを作成、その中で以下のスクリプトを用意した。

  • bsN_run_paml.sh
  • bsN_template.ctl
###bsN_run_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname"
bsA_dir="/home/kosukesano/tools/for_paml/6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fasta; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fasta)
    outfile_path="$result_dir/${base_name}_branch_alt_null"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
###bsN_template.ctlの中身

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/test/data/Orthofinder_tree_convert.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

これも同様にqsubで投げた。

0806

PAML出力のまとめ、尤度比検定

昨日のジョブがうまくいって終わったので、結果を比較する。尤度比検定用のスクリプトは以下の通り。

###~/tools/for_paml/6sp/bs_lrp.pyの中身

import os
import re
from scipy.stats import chi2

def parse_lnL(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            match = re.search(r'lnL\(ntime: \d+  np: (\d+)\):\s+(-?\d+\.\d+)', line)
            if match:
                np = int(match.group(1))
                lnL = float(match.group(2))
                return np, lnL
    return None, None

def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
    lr_stat = 2 * (alt_lnL - null_lnL)
    df = alt_np - null_np
    p_val = chi2.sf(lr_stat, df)
    return p_val

def main():
    alt_dir = '~/tools/for_paml/6sp/bsA/result'
    null_dir = '~/tools/for_paml/6sp/bs_null/result'
    output_file = 'branch_site_lrt_results.txt'

    alt_dir = os.path.expanduser(alt_dir)
    null_dir = os.path.expanduser(null_dir)

    og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]

    with open(output_file, 'w') as out_f:
        out_f.write('OG_num\tp_val\tpositive_selection\n')

        for og_file in og_files:
            og_num = og_file.split('_')[0]
            alt_file = os.path.join(alt_dir, og_file)
            null_file = os.path.join(null_dir, og_file.replace('_maffted_fixed_branch_alt', '_maffted_fixed_branch_alt_null'))

            if os.path.exists(null_file):
                alt_np, alt_lnL = parse_lnL(alt_file)
                null_np, null_lnL = parse_lnL(null_file)

                if alt_np is not None and null_np is not None:
                    p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
                    reject_null = '+' if p_val < 0.05 else '-'
                    out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')

if __name__ == "__main__":
    main()

結果は~/tools/for_paml/6sp/branch_site_lrt_results.txtとして出力された。

### ~/tools/for_paml/6sp/branch_site_lrt_results.txtの中身

G_num  p_val   positive_selection
OG0008991       1.0     -
OG0008220       1.0     -
OG0009516       0.02359893128372939     +
OG0009076       1.0     -
OG0009448       1.0     -
OG0010062       1.0     -
OG0009276       1.0     -
OG0009923       1.0     -
OG0009794       0.27039658785634013     -
OG0008669       1.0     -
OG0009998       0.5081029121888809      -
OG0009946       0.8522214828985033      -
OG0009111       0.6973912229476 -
OG0009787       1.0     -
OG0008626       0.2573023006043861      -
OG0009987       1.0     -
OG0009641       0.5390711466832219      -
OG0009570       1.0     -
OG0008058       1.0     -
OG0009949       0.34527143821093376     -
OG0008996       0.705351995899238       -
OG0008847       1.0     -
OG0009812       1.0     -
OG0009515       0.42550751108380647     -
OG0009410       1.0     -
OG0008868       0.28057388758663304     -
.
.
.
.
.
.

これをローカル環境に持っていき、遺伝子機能のファイルと照合した。

###~/bio/for_cafe/caferesult.Rの一部


###############################################################
#マダラの遺伝子番号とOGの紐付け

df4 <- orthogroups %>%###
  separate_rows(V5, sep = ", ") %>%
  rename(gene_ID = V5, family_ID = V1)|>
  print()

View(df4)

df5=dplyr::left_join(df4, fa, by = c(gene_ID = "Madara"))|>###完成系
  print()


#################################################################
paml<-read.csv("branch_site_lrt_results.txt", sep="\t")
print(paml)

paml_po=paml|>
  dplyr::filter(positive_selection == "+") |>
  print()

df6=dplyr::inner_join(paml_po, df5, by = c(OG_num = "family_ID"))|>
  print()

View(df6)

正の選択が検出された遺伝子は15個であった。

0814

PAMLの多重検定とp値の補正

多重検定における尤度比の補正を行った。遺伝研が不具合でログインできなかったため、ローカルで実行。

まずpaml用のディレクトリを作成し、その中にbranch_site_lrt_results.txtをコピー。

:~/bio$ mkdir for_paml
:~/bio$ ls
DEG_Adult_vs_Larva_DESeq2.csv SRR11742112_2.fastq           SRR9665770_report1.html       for_cafe                      madara_annotated              qc_SRR9665770_2.fq
DEG_ovary_vs_body_DESeq2.csv  SRR9665770                    fastp.json                    for_eggnoc                    memo.txt
SRR11742112                   SRR9665770_1.fastq            femo_annotated                for_paml                      new_rbh.py
SRR11742112_1.fastq           SRR9665770_2.fastq            for_blast_test                functional_annotation         qc_SRR9665770_1.fq
:~/bio$ cd for_paml/
:~/bio/for_paml$ pwd
/Users/kosukesano/bio/for_paml
:~/bio/for_paml$ cd ../for_cafe/
:~/bio/for_cafe$ ls
CAFE_plus_gene.csv          ManualPhylo_2.py            Rplot01.png                 caferesult.R                for_sinkagakkai.png
DEG_CAFE_adult_vs_larva.csv ManualPhylo_3.py            Rplot02.png                 caferesult_6sp.png          ogfil.py
DEG_CAFE_ovary_vs_body.csv  Original_data               ThroughoutCAFE.R            caferesult_6sp_iqtree.png   old_result
Deg                         Processed_data              branch_site_lrt_results.txt cleaned_orthogroups.tsv     out_madara_SP.txt
ManualPhylo_1.py            Rplot.png                   bs_positive_gene.csv        for_cafe.Rproj              tree_ultrametric.nwk
:~/bio/for_cafe$ cp branch_site_lrt_results.txt ../for_paml/
:~/bio/for_cafe$ cd ../for_paml/
:~/bio/for_paml$ ls
branch_site_lrt_results.txt
:~/bio/for_paml$

次にPythonの仮想環境paml_hoseiを作成。

:~/bio/for_paml$ python3 -m venv paml_hosei
:~/bio/for_paml$ source paml_hosei/bin/activate
(paml_hosei) :~/bio/for_paml$ pip install pandas

Collecting pandas
  Downloading pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 4.1 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
  Downloading six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Downloading pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl (12.5 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.5/12.5 MB 58.6 MB/s eta 0:00:00
Downloading numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl (21.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.0/21.0 MB 39.4 MB/s eta 0:00:00
Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 229.9/229.9 kB 18.1 MB/s eta 0:00:00
Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 20.4 MB/s eta 0:00:00
Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 34.5 MB/s eta 0:00:00
Downloading six-1.16.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: pytz, tzdata, six, numpy, python-dateutil, pandas
Successfully installed numpy-2.0.1 pandas-2.2.2 python-dateutil-2.9.0.post0 pytz-2024.1 six-1.16.0 tzdata-2024.1

[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: pip install --upgrade pip
(paml_hosei) :~/bio/for_paml$ 

次に実行スクリプトのhosei.pyを作成。

### hosei.pyの中身

import pandas as pd
from statsmodels.stats.multitest import multipletests

# 入力ファイルと出力ファイルのパス
input_file = '/Users/kosukesano/bio/for_paml/branch_site_lrt_results.txt'
output_file = '/Users/kosukesano/bio/for_paml/hosei_branch_site_lrt_results.txt'

# ファイルを読み込む
df = pd.read_csv(input_file, sep='\t')

# p値をリストに変換
p_values = df['p_val'].tolist()

# FDR制御を使用してp値を補正
rejected, q_values, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')

# 補正後のq値と有意差をデータフレームに追加
df['q_val'] = q_values
df['significant'] = rejected

# 結果を新しいファイルに出力
df.to_csv(output_file, sep='\t', index=False)

print(f"補正後の結果が{output_file}に保存されました。")

実行

(paml_hosei) :~/bio/for_paml$ python hosei.py
補正後の結果が/Users/kosukesano/bio/for_paml/hosei_branch_site_lrt_results.txtに保存されました。
(paml_hosei) :~/bio/for_paml$ 

結果

3つの遺伝子で有意差が認められた。 - g4236.t1 acyl-CoA dehydrogenase family member 9, mitochondrial - g9945.t1 laminin subunit alpha - g12267.t1 D-glucuronyl C5-epimerase B

2024年9月

0903

ASTRALの実行

ASTRAL.sh.o26291746の記述は以下の通り。

start at
2024年  7月 31日 水曜日 23:02:59 JST
Error occurred during initialization of VM
Could not reserve enough space for 629145600KB object heap
2024年  7月 31日 水曜日 23:03:00 JST

ヒープサイズ?を設定する値が大きすぎてエラーが出ているっぽい。

ASTRAL.shを見ると確かに-Xmx629145600Kとなっていた。これを以下のように変更。

### ASTRAL.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date

java -Xmx2G -jar astral.5.7.8.jar \
    -i /home/kosukesano/tools/for_ASTRAL/Astral/data/modified_trees.nwk \
    -o /home/kosukesano/tools/for_ASTRAL/Astral/240903_result/out.tre \
    2>/home/kosukesano/tools/for_ASTRAL/Astral/240903_result/out.log

date

-Xmx2Gにした。ついでに出力ファイルも240903_resultとした。

kosukesano@at139:~/tools/for_ASTRAL/Astral$ mkdir 240903_result

240903_resultを作成したのち、ASTRAL.shqsubで投げた。

結果

kosukesano@at139:~/tools/for_ASTRAL/Astral$ ls 240903_result/
out.log  out.tre

2つのファイルが出力された。それぞれのファイルの中身は以下の通り。

### out.logの中身

================== ASTRAL ===================== 

This is ASTRAL version 5.7.8
Gene trees are treated as unrooted
1518 trees read from /home/kosukesano/tools/for_ASTRAL/Astral/data/modified_trees.nwk
index0
All output trees will be *arbitrarily* rooted at Agra

======== Running the main analysis
Number of taxa: 6 (6 species)
Taxa: [Agra, Cass, Dpon, Smad, Sory, Tcas]
Taxon occupancy: {Cass=1518, Sory=1518, Tcas=1518, Agra=1518, Smad=1518, Dpon=1518}
Number of gene trees: 1518
0 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Building set of clusters (X) from gene trees 
------------------------------
gradient0: 63
Number of Clusters after addition by distance: 63
calculating extra bipartitions to be added at level 1 ...
Adding to X using resolutions of greedy consensus ...
Limit for sigma of degrees:200
polytomy size limit : 4
discarded polytomies:  [3, 4]
Threshold 0.0:
Threshold 0.01:
Threshold 0.02:
Threshold 0.05:
Threshold 0.1:
Threshold 0.2:
polytomy of size 3; rounds with additions with at least 5 support: 0; clusters: 63
Threshold 0.3333333333333333:
polytomy of size 4; rounds with additions with at least 5 support: 0; clusters: 63
max k is :0
Number of Clusters after addition by greedy: 63
gradient0 in heuristiic: 63
partitions formed in 0.477 secs
Dynamic Programming starting after 0.477 secs
Using tree-based weight calculation.
Using polytree-based weight calculation.
Polytree max score: 22770
Polytree building time: 0.061 seconds.
Number of quartet trees in the gene trees: 22770
Size of largest cluster: 6
Greedy score: 11289
estimationFactor: 2.017007706617061
Sub-optimal score: 11289
Total Number of elements weighted: 136
Normalized score (portion of input quartet trees satisfied before correcting for multiple individuals): 0.4957839262187088
Optimization score: 11289
Optimal tree inferred in 0.694 secs.
(Cass,(Smad,(Dpon,(Agra,(Sory,Tcas)))));
Final quartet score is: 11289
Final normalized quartet score is: 0.4957839262187088
Extended species tree:
(Agra,((Sory,Tcas)1:0.7104118856165281,(Dpon,(Smad,Cass)0.63:0.017264705056582508)1:0.07499457563408858));
Weight calculation took 0.024812803 secs
ASTRAL finished in 1.732 secs
### out.treの中身

(Agra,((Sory,Tcas)1:0.7104118856165281,(Dpon,(Smad,Cass)0.63:0.017264705056582508)1:0.07499457563408858):0.0); 

これをもとに書いた系統樹は以下の通り。

tree = read.tree("/Users/kosukesano/bio/240903_ASTRAL.tre")

p=ggtree(tree)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()

p

A. grandisが最外群になってる〜!

out.txtの途中に出てた枝長などがない系統樹をコピーして~bio/240903_ASTRAL_Optimal_tree.treとして保存し描画してみる。

tree2 = read.tree("/Users/kosukesano/bio/240903_ASTRAL_Optimal_tree.tre")
p=ggtree(tree2)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()

p

一応Tcasが最外群にはなっている?

過去の系統樹はこちら

OrthoFinderとIQ-TREEで描いた系統樹

Dmelanogasterを加えた系統樹作成、それの準備としてのOrthofinder

外群が外群として機能していないので、一度絶対に外群だろうという昆虫(今回はキイロショウジョウバエ)を加えて系統樹を描いてみる。そのためにまずOrthoFinderを実行する。

~/tools/for_orthofinderディレクトリにSmad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dirディレクトリを作成。その中に7種の昆虫のアミノ酸配列ファイルを格納した。

kosukesano@at139:~/tools/for_orthofinder$ ls Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/
Agra.fasta  Cass.fasta  Dmel.fasta  Dpon.fasta  Smad.fasta  Sory.fasta  Tcas.fasta
kosukesano@at139:~/tools/for_orthofinder$ 

また、seven_sp.shを作成した。

### seven_sp.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder -f ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir -t 5 -a 5

date

これをqsubで投げた。

また、手動での系統樹作成アルゴリズムの1つである、ManualPhylo_1.pyを作成し実行する。

まず、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03下にManualPhylo_dataディレクトリを作成した。

kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03$ mkdir ManualPhilo_data

0904

7種の昆虫ゲノムを使ったOrthoFinderが終わり、無事出力された。

OrthoFinder出力の系統樹はこんな感じ。

tree3 = read.tree("/Users/kosukesano/bio/7sp.tre")
p=ggtree(tree3, branch.length = 'none')+
  xlim(0,9)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.1, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

7種の昆虫ゲノムのアミノ酸配列fastaファイルを統合し、1つのファイルにする。

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree下でfasta_concatinate.shを作成した。fasta_concatinate.shの中身は以下の通り。

### fasta_concatinate.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir"  ## Please replace with the actual directory containing the fasta files

# Define the output directory and output file
new="output_directory"
mkdir -p $new

# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
    cat "$file" >> "./${new}/all_seq.fa"
done


date

これを作業ノードで実行権限を与えて実行した。

kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree$ chmod +x fasta_concatinate.sh
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree$ ./fasta_concatinate.sh

結果として、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree/output_directoryall_seq.faができた。

kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree/output_directory$ ls
all_seq.fa

続いて、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree下でManualPhylo_2.pyを作成し実行した。

ManualPhylo_2.pyの中身は以下の通り。0718のManualPhylo_2.pyではOGがきちんと抽出できないので注意!

### ManualPhylo_2.pyの中身

##ManualPhylo_1.pyの続き

import sys
from Bio import SeqIO

path = "../ManualPhylo_data/"

fasta_in = sys.argv[1]                                  #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2]                                  #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する

for q in open(query_in, "r"):                                           #オーソログファイルを開いて1行づつ読み込む
        query = q.split()                                                       #スペース毎に切りとってリスト形式でqueryに保存する
        f = open(path + query[0], 'w')                                  #最初の列(OG名)と同じ名前のファイルを作成する
        for record in SeqIO.parse(fasta_in, 'fasta'):   #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
                id_part = record.id                                     #fastaのID部分を読み込む
                desc_part = record.description                  #fastaのdescription部分を読み込む
                seq = record.seq                                        #fastaの配列部分を読み込む
                for i in range(len(query)):                         #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
                        if id_part == query[i] or desc_part == query[i] :                   #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
                                fasta_seq = '>' + id_part + ' ' + desc_part + '\n' + seq + '\n'         #fasta形式に整え
                                print(fasta_seq)                                        #標準出力にfastaを出力(進行状況把握用)
                                f.write(str(fasta_seq))                             #各OGファイルにfastaを出力
        f.close()

##できたOGファイルは、align.shやOG_list.txtと同じ場所に
##align.shのある場所までいき、作動。cwdを231016/ManualPhylo_dataにしないとtrimalが作動せず、イライラ

これにより630個のSCOが抽出された。

kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data$ ls -1 | wc -l
632

### ls -1 | wc -lでファイル数をカウント。OG_list.txtとspecies_list.txtがあるので-2する。

続いて、MAFFTtrimAlによってOGをアライメントする。

まずMAFFTtrimAlの環境を立ち上げる。

source ~/tools/pyenv_env/ManualPhilo_profile

続いて、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data下にalign.shを作成、実行する。

align.shの中身は以下の通り。

### align.shの中身


#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
    mafft --auto $x > $x.maffted.fa
    trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -htmlout $x.maffted.trimed.fa.html -automated1
done

実行コマンドは以下の通り

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data$ sh align.sh OG_list.txt

続いて~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_treeにてManualPhylo_3.pyを作成、実行した。

ManualPhylo_3.pyの中身は以下の通り。

### ManualPhylo_3.pyの中身


##align.shした後

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys
from Bio import SeqIO

path = "../ManualPhylo_data/"

query_in = sys.argv[1]                            #1番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
species_in = sys.argv[2]                          #2番目の引数にはOG_list.txtと同じ順番で種名を記述したファイルを指定する

sp_list = []
for sp in open(species_in, "r"):                     #種名ファイルを開く
        sp_list.extend(sp.split())                            #スペース毎に切りとってリスト形式でsp_listに保存する

for q in open(query_in, "r"):                     #オーソログファイルを開いて1行づつ読み込む
        query = q.split()                             #スペース毎に切りとってリスト形式でqueryに保存する

        f = open(path + query[0]+".maffted.trimed.edit.fa", 'w')
        for record in SeqIO.parse(path + query[0]+".maffted.trimed.fa", 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
                desc_part = record.description            #fastaのdescription部分を読み込む
                seq = record.seq                          #fastaの配列部分を読み込む
                desc_part_new = desc_part.split()[0]

                for i in range(len(query)-1):              #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す

                        if desc_part_new == query[i+1] :            #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
                                fasta_seq = '>' + sp_list[i] + '\n' + seq + '\n'     #配列名を該当する種名に置き換えて、fasta形式に整えて
                                print(fasta_seq)                  #標準出力にfastaを出力(進行状況把握用)
                f.write(str(fasta_seq))           #各OGファイルにfastaを出力
        f.close()

##scorpionでnano run.nexを変更し、以下を実行
##iqtree –sp run.nex –nt AUTO –bb 1000
~

実行コマンド

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree$ python ManualPhylo_3.py ../ManualPhylo_data/OG_list.txt ../ManualPhylo_data/species_list.txt

続いて~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data下にmakealltree.shを作成、qsubでジョブとして投げた。

makealltree.shの中身は以下の通り。

### makealltree.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"

# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data
# 出力ファイル

output_file="all_trees.nwk"

# 既存の出力ファイルを削除
if [ -f $output_file ]; then
    rm $output_file
fi

# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
    # ファイル名から拡張子を除いたベース名を取得
    base_name=$(basename $file .maffted.trimed.edit.fa)

    # Singularityを使用してIQ-TREEを実行して系統樹を作成
    singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}

    # 作成された系統樹ファイル (.treefile) を output_file に追加
    if [ -f ${base_name}.treefile ]; then
        echo -n "${base_name}: " >> $output_file
        cat ${base_name}.treefile >> $output_file
        echo "" >> $output_file
    else
        echo "Error: ${base_name}.treefile not found" >&2
    fi
done

echo "All trees have been written to $output_file"

date

0905

昨日のIQ-TREEはうまくいっており、最終出力のall_trees.nwkも出力されていた。

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data$ ls all_trees.nwk 
all_trees.nwk

これをASTRAL用のディレクトリにコピー。

~/tools/for_ASTRAL/Astral/dataには以前のインプットファイルがあったため、6sp7spというディレクトリを作りそこに格納。

kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
all_trees.nwk  modified_trees.nwk  modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ mkdir 6sp
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
6sp  all_trees.nwk  modified_trees.nwk  modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ mv *.nwk 6sp
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
6sp  modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ mkdir 7sp
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
6sp  7sp  modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$

7spの方に今回のall_trees.nwkをコピーし、modify.pyでOG番号を切り取った。

modify.pyの中身は以下の通り。

### modify.pyの中身

# 元のファイルと新しいファイルのパスを設定
input_file_path = '7sp/all_trees.nwk'
output_file_path = '7sp/modified_trees.nwk'

# 元のファイルを開いて処理
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        # 行を ': ' で分割し、要素が2つ以上の場合のみ処理
        parts = line.split(': ', 1)
        if len(parts) > 1:
            modified_line = parts[1]
            # 新しいファイルに書き込み
            outfile.write(modified_line)

これを使ってASTRALを実行。使ったスクリプトは以下の通り。

### ASTRAL.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date

java -Xmx2G -jar astral.5.7.8.jar \
    -i /home/kosukesano/tools/for_ASTRAL/Astral/data/7sp/modified_trees.nwk \
    -o /home/kosukesano/tools/for_ASTRAL/Astral/240905_result/out.tre \
    2>/home/kosukesano/tools/for_ASTRAL/Astral/240905_result/out.log

date

これをqsubで投げた。

欠失1つを許したASTRALの作成

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/Orthogroups下でSCOwiith0tyusyutu.pyを作成、実行した。

### SCOwiith0tyusyutu.pyの中身


import pandas as pd

# ファイルを読み込む
file_path = "Orthogroups.GeneCount.tsv"
df = pd.read_csv(file_path, sep="\t")

# 各列が1か0で、1つの列が0で他が全て1の行を抽出
filtered_df = df[
    ((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
    ((df['Agra'] == 0) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
    ((df['Agra'] == 1) & (df['Cass'] == 0) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
    ((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 0) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
    ((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 0) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
    ((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 0) & (df['Tcas'] == 1)) |
    ((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 0))
]

# Orthogroup 列のみを抽出
filtered_df = filtered_df[['Orthogroup']]

# 抽出したデータを新しいファイルに保存 (ヘッダー削除) 拡張子を .txt に変更
output_file_path = "Orthogroups.GeneCount.SingleCopyWithOneZeroOrtholog.txt"
filtered_df.to_csv(output_file_path, sep="\t", index=False, header=False)


print(f"抽出されたデータが {output_file_path} に保存されました。")

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25下にSCOwithOneZero_Manualphylo_dataを作成、Manualphylo_1,2とalign.shを実行した。

kosukesano@at137:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ ls ManualPhylo_*
ManualPhylo_1.py  ManualPhylo_2.py  ManualPhylo_3.py
### ManualPhylo_1.pyの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ python ManualPhylo_1.py 

### ManualPhylo_2.pyの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ python ManualPhylo_2.py ~/tools/for_orthofinder/make_philo_tree/output_directory/all_seq.fa OG_list.txt

### align.shの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ sh align.sh OG_list.txt

0907

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data下でManualphylo_3.pyを実行した。

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ python3 ManualPhylo_3.py OG_list.txt species_list.txt

同じディレクトリにmakealltree.shをコピーし、パスを書き換えて実行した。

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ cp ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data/makealltree.sh ../SCOwithOneZero_Manualphylo_data/
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ ls makealltree.sh 
makealltree.sh
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ less makealltree.sh 
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ nano makealltree.sh 
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ qsub_beta makealltree.sh 
Your job 26698378 ("makealltree.sh") has been submitted

書き換えたmakealltree.shは以下の通り。

### makealltreeの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"

# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data
# 出力ファイル

output_file="all_trees.nwk"

# 既存の出力ファイルを削除
if [ -f $output_file ]; then
    rm $output_file
fi

# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
    # ファイル名から拡張子を除いたベース名を取得
    base_name=$(basename $file .maffted.trimed.edit.fa)

    # Singularityを使用してIQ-TREEを実行して系統樹を作成
    singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}

    # 作成された系統樹ファイル (.treefile) を output_file に追加
    if [ -f ${base_name}.treefile ]; then
        echo -n "${base_name}: " >> $output_file
        cat ${base_name}.treefile >> $output_file
        echo "" >> $output_file
    else
        echo "Error: ${base_name}.treefile not found" >&2
    fi
done

echo "All trees have been written to $output_file"

date

0908

前日のジョブをまちがって上書きしてしまった……。

すべてqdelして改めてqsub

0909

IQ-TREEのジョブは無事できていた。最終出力のall_trees.nwkASTRAL用のディレクトリにコピー。

~/tools/for_ASTRAL/Astral/data/6sp_withOneZeroというディレクトリを作りそこに格納。modify.pyで OG番号を切り取った。

これを使ってASTRALを実行。使ったスクリプトは以下の通り。

### ASTRAL.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date

java -Xmx2G -jar astral.5.7.8.jar \
    -i /home/kosukesano/tools/for_ASTRAL/Astral/data/6sp_withOneZero/modified_trees.nwk \
    -o /home/kosukesano/tools/for_ASTRAL/Astral/240909_result/out.tre \
    2>/home/kosukesano/tools/for_ASTRAL/Astral/240909_result/out.log

date

0910

240910_resultを作ってなかったのでエラー……。

mkdirで作り直してもう一度動かす。ついでに作業ノードで実行権限を付与して行ってみる。

kosukesano@at138:~/tools/for_ASTRAL/Astral$ mkdir 240910_result
kosukesano@at138:~/tools/for_ASTRAL/Astral$ nano ASTRAL.sh
kosukesano@at138:~/tools/for_ASTRAL/Astral$ chmod +x ASTRAL.sh
kosukesano@at138:~/tools/for_ASTRAL/Astral$ ./ASTRAL.sh 
start at
Tue Sep 10 10:24:33 JST 2024
Tue Sep 10 10:24:34 JST 2024
kosukesano@at138:~/tools/for_ASTRAL/Astral

1秒でできた。

出力ファイルもしっかりある。今後はジョブとして投げなくて良さそう。

kosukesano@at138:~/tools/for_ASTRAL/Astral/240910_result$ ls
out.log  out.tre
kosukesano@at138:~/tools/for_ASTRAL/Astral/240910_result$

出力ファイルout.txtをよく見てみる。

======== Running the main analysis
Number of taxa: 6 (6 species)
Taxa: [Agra, Cass, Dpon, Smad, Sory, Tcas]
Taxon occupancy: {Cass=1950, Sory=1950, Tcas=1518, Agra=1950, Smad=1950, Dpon=1950}
Number of gene trees: 1950
432 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Will attempt to complete bipartitions from X before adding using a distance matrix.
Building set of clusters (X) from gene trees 

Tcasしか欠失分が取れていない?

系統樹の描画

最終出力であるout.tre240910_ASTRAL.treとしてコピー、それを元に系統樹を描画。

tree4 = read.tree("/Users/kosukesano/bio/240910_ASTRAL.tre")

p=ggtree(tree4)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

6種のSCOについて、欠失を1つまで許した時の系統樹

前の系統樹と比べると、マダラとキクイムシの位置がずれている。

out.txtの中にある系統樹もコピーし、240910_ASTRAL_Optimal_tree.treとして描画。

tree5= read.tree("/Users/kosukesano/bio/240910_ASTRAL_Optimal_tree.tre")
p=ggtree(tree5)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

SCOの欠失部分がうまく取れていない件について

Tcasで欠失しているOG0010273を例にとって検証

Orthogroup      Agra    Cass    Dpon    Smad    Sory    Tcas    Total
OG0010273       1       1       1       1       1       0       5
OG0010276       1       1       1       1       1       0       5
OG0010278       1       0       1       1       1       1       5
### OG0010273.maffted.fnaの中身

>XP_050293049.1 XP_050293049.1 ras-related protein Rab-27A [Anthonomus grandis grandis]
--MEYDYLIKFLALGDSGVGKTSFLYQYTDSSFNSRFISTVGIDFREKRLIYQAKGRSYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFILIFDLTNEQSFLEIRNWINQLRIHAYCDTPD
IVLCGNKADLEDRRVVSEWKAREFAELNGLPYLETSAATGQNVSRSIETLLERVMIRMET
AVDSAMLPTHRDNFRNPLRVGLNTNYSAQKCSC
>XP_019755005.1 XP_019755005.1 ras-related protein Rab-27A [Dendroctonus ponderosae]
MRMDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRLIYQSKGRNYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNERSFLEIRNWIEQLRVHAYCDTPD
IVLCGNKADIEDRRVVSEWKAREFAEINGLPYLETSAATGQNISRAIETLLEKVMYRMET
AVDMAMLPNRRGNPGDHSQIDLSAPSSAQKCLC
>g2477.t1 g2477.t1
--MEYDYLIKFLALGDSGVGKTSFLHQYTDGTFNSRFISTVGIDFREKRLVYQSKGRNYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWIEQLRVHAYCDTPD
VILCGNKADLEDRRVITEWKAREFAESNGLPYLETSAATGQNVSRAIETLLEKVMYRMET
AVDMAMLPNRRGNLKEVLKVDLNASPSAQKCLC
>XP_030762023.1 XP_030762023.1 ras-related protein Rab-27A [Sitophilus oryzae]
--MDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRMIYQSKGRNYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEHSFLEIRNWIEQLRLHAYCDTPD
IVLCGNKADLEDRRVVTEWRAREFAEINGLPYLETSAATGQNVSRAVETLLEKVMLRMET
AVDMAMVPGQSGKFKDTGEFMLRSSSPAQKCTC
>XP_967715.1 XP_967715.1 PREDICTED: ras-related protein Rab-27A [Tribolium castaneum]
--MDYDYLIKFLALGDSGVGKTSFLYQYTDGLFNSRFISTVGIDFREKRLLYQSKGRNHR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWVEQLRLHAYCDCPD
VVLCGNKADLEDRRIITEWRAREMAEKLGLVYLETSAATGQNVSRAVETLLEKVMIRMET
AVDRAMLPGRRGRPRDPNDVDFNAP-PTHNCTC

Cass以外の5種の配列が記述されている。

### OG0010278.maffted.trimed.edit.faの中身

>Agra
MEYDYLIKFLALGDSGVGKTSFLYQYTDSSFNSRFISTVGIDFREKRLIYQAKGRSYRVHLQLWDTAGQERFRSLTTAFYRDAMGFILIFDLTNEQSFLEIRNWINQLRIHAYCDTPDIVLCGNKADLEDRRVVSEWKAREFAELNGLPYLETSAATGQNVSRSIETLLERVMIRMETAVDSAMLPTHRDNFRNPLRVGLNTNSAQKCSC
>Cass
MDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRLIYQSKGRNYRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNERSFLEIRNWIEQLRVHAYCDTPDIVLCGNKADIEDRRVVSEWKAREFAEINGLPYLETSAATGQNISRAIETLLEKVMYRMETAVDMAMLPNRRGNPGDHSQIDLSAPSAQKCLC
>Dpon
MEYDYLIKFLALGDSGVGKTSFLHQYTDGTFNSRFISTVGIDFREKRLVYQSKGRNYRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWIEQLRVHAYCDTPDVILCGNKADLEDRRVITEWKAREFAESNGLPYLETSAATGQNVSRAIETLLEKVMYRMETAVDMAMLPNRRGNLKEVLKVDLNASSAQKCLC
>Smad
MDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRMIYQSKGRNYRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEHSFLEIRNWIEQLRLHAYCDTPDIVLCGNKADLEDRRVVTEWRAREFAEINGLPYLETSAATGQNVSRAVETLLEKVMLRMETAVDMAMVPGQSGKFKDTGEFMLRSSPAQKCTC
>Sory
MDYDYLIKFLALGDSGVGKTSFLYQYTDGLFNSRFISTVGIDFREKRLLYQSKGRNHRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWVEQLRLHAYCDCPDVVLCGNKADLEDRRIITEWRAREMAEKLGLVYLETSAATGQNVSRAVETLLEKVMIRMETAVDRAMLPGRRGRPRDPNDVDFNAPPTHNCTC

Cassが入り、その分Tcasが消えてる。種名ラベルの記述がミスっている?

0911

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/下にtestディレクトリを作り、色々検証。

OG_list.txtspecies_list.txtManualphylo_*ファイルをコピーした。

Manualphylo_2.pyの実行

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ python ManualPhylo_2.py ~/tools/for_orthofinder/make_philo_tree/output_directory/all_seq.fa OG_list.txt

Manualphylo_2.pyを実行すると、OG0008034のようなOGごとの配列が記録されたファイルができる。

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ ls
ManualPhylo_1.py  ManualPhylo_3.py  OG0008034  OG0008036  OG0008039  OG0008041  OG0008043  OG0008045  OG0008048  OG_list.txt  species_list.txt
ManualPhylo_2.py  OG0008033         OG0008035  OG0008037  OG0008040  OG0008042  OG0008044  OG0008046  OG0008049  align.sh
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ 

上記のは途中でCtr+Cを押して終了させた。

OG0010273のみをコピーし、残りは消去。

align_shの実行

kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ source ~/tools/pyenv_env/ManualPhilo_profile
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ sh align.sh OG_list.txt

align_shを実行すると、*maffted.fa*maffted.trimed.faができる。*maffted.faはヘッダーがフルで入っているが、*maffted.trimed.faはヘッダー部分は遺伝子IDのみになっている。

### OG0010273.maffted.faの中身

>XP_050292831.1 XP_050292831.1 uncharacterized protein LOC126733540 [Anthonomus grandis grandis]
MPFNVMIQLCNITRQSKLFQRNFYLPLLKPLSPKDALRIHVITGKDVLSKALEHWIPVLE
EYSARVARKRHMPKGKGKKRLRKKESMKYLMPFFDSGMDNHAKLENVCNRKSKGRGQCYT
YYIPEKKCTLVFTHQMERAIRDKDIIDVIIAQRHEKILVKMRDGTKVVMPLSPYNGKGPL
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATASNMDWDAFEEETKQIVEQANEPVNDEPIAMQLDNIDKTCDVIDKLKAAGEE
VLSLLPTMPEIPEILKVIKDQSELTEIANISGARVNLKASAERPGERFVPGQMVTSEEGD
LFVPGQTILNESGVKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITESGELQ
FAATDEDILPHVDTPPP-EKEVEEIELEEEQNSEDEEIEQRPPPKPKRKELTYERPKREF
KTENMGPKRRVRGPKKVAEPAPAPAPPTLE--RRPTIIIEAKLFNLQTPTFEKDILEQQK
ERVEAFKEKTGKEEARLNKYRLELRMKAKKMRESLPPPPIYEPLEPVRKSEKLRELEKSI
KKGRFFEADHKKYITNEYTE-KFHWIDTYQYKKVFDTVGIMRHRVWKPVYS--H
>CAG9762300.1 CAG9762300.1 unnamed protein product [Ceutorhynchus assimilis]
MPFNVMIQLCNITRQS----------------------------KDVLAKALEHWIPVLE
DHASRVARRRHMPKGKGKKRQRKKESMRYLTPFFDEMMDNHGRLENMIIRKPKGRGQCYT
YYIPEKKCTLVLTHQFEKAIRDKDIVDIVIAQRHEKIIVKLRDGLKVVMPLSPYSGKAPF
YRGEGWTKKDIEEFHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATTENMDWDAFEEESRQIVEEANEPVEDDPIAMQLNDIEKTCDVTEKLKAAGED
VMSLLPKMPEIPEILKIFKDHSELTEIANVSGARVNLATGT----HRFVPGQMVTSEDGE
IFVPGQTVLTESGEREYTPGFTVLLEDEPTLIPGLVMGNDPEKTMFLPGEATITESGELQ
FGVNEDDIVP--SLTPPFEKEVEEIELEEEQNSEEEEIEQRPPPK--KKELTYERPKREF
NQEKMGPKHRVRGPKKLPPVVSPPEKDLAD--RRKTIVPDTKLFDLTTQTFEKDFLEQEK
ERVEAFKEKTGKEEAKVDKQRREIKLMVKKMRDSLPPKPKYQPLEPVRKSEKLRDMEKSI
KKGKFFEVDYKKWLTKENNHEPFHWMDTYQYKKTFDSVGIIRHRIWKSVY----
>XP_019758826.1 XP_019758826.1 uncharacterized protein LOC109536854 [Dendroctonus ponderosae]
MPFNVMIQLCNITRQSKLFQRNFFLPLLKPLSPKDALRIHIITGKDVLVKAIEHWIPVLE
EYACRVQRRRHMPKGKGKKRQRKKESMKYVMPFFDDMMENHPKLENLIIRKPKGRGQCYT
YYIPEKKCTLVLTYQMEKAIRDKDIVDIVIAQRHEKILVKMRDGTKVVMPLCPYNGHAPF
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDSRD
WKDMLNSTVNNMDWDAFEEESKQIVAESNDPVDDEPISMQLDDIEKTCDVNEKLKAAGDE
IMSLLPTMPEIPQILKVMKDQSEFTEIANISGARVSLSAGS----DRFVPGQMVTSEDGE
LFVPGQTVVNESGDKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITEAGELQ
FAATEDDIIPHPPTPPPEEKEVEEVELEEDQNSEEEEVEQRPPPKRERKELTYERPKREF
NTENMGPKHRVRGPKKVAPVV-IKTEETPDPVRRKT-IIDAKIFDLQTPTFEKDFLEQEK
ERVEAFKEKSGKEEAKVDKQRREIKLKVKKLVDSRPPPPKYEPLEPVRKSEKLREFEKSI
KRGKFFDVDYKKYLTKEYTG-QFHWLDTYQYRNTFDTVGIMRHRIWKSVY----
>g9568.t1 g9568.t1
MPFNVMIQLCSITRQSKLFQRNFYLPLLKPLSPKDALRIHIITGKDVLAKALEHWIPVLE
EYASRVARRRHMPKGRGKKRQRKKDSMKYVCPFFDDNMDNHGRLENMVNRKSKGRGQCYT
YYIPEKKCTLVVTHQMERAIRDKDIVDIVIAQRQEKILVKMKDGTKVVMPLCPYNGRNPL
YRGEGWTKKEIEEYHHHGRDTFSIAQLFEAKEAGIEEWELEMMRMANKRKKKIKGEDTKD
WKEMLSTTMQNMDWDAFEEESKQIVEEANEPVDDEPIPMQLNDMDKTCDVIEKLKAAGDD
VLKLLPVMPEIPEILKVIQDQSEFTEIASISGARVSLTSGS----ERFVPGQMVTSEEGE
LFVPGQTTVSESGEKEYTPGFTVLLDNEPTLIPGLVMGNDPEKSMFLPGESTITESGELQ
FAATEDDILPYQPAPPSEEKEVEEVELEEEQNSEEEEIEQRPPPKREKKEFTYERPKREF
NPESMGPKHRVRGPKKVPPMVQAPAEPTPDPARRKT-VVEVKIFDLQTPTFEKDFLEQEK
ERVEAFKEKSGKEEAKVDKQRREIKMMAKKIIDSSPRVVKYEPLEPVKKSEKLREFEKSI
KKGNFFDVDYKKWLSRNHKE-QFHWADTYQYRNTFDTVGIMRHRVWKSVYSSRK
>XP_030763397.1 XP_030763397.1 uncharacterized protein LOC115887965 [Sitophilus oryzae]
MPFNVMIQLCRVTRASKLFQRNFYLPLLKPLSPKDALRRHIITGKDVLQKALEHWIPVLE
EYAARVQRRRHAPKGRGKKRQRRKESMKYVMPFFDDTLPSHPKLENLVARKSKGRGQCYT
YHIPEKKCTLVLTHQMERAIRDKDIVDIVIAQRHEKIIVKMRDGTKVTMPLCPYEGRAPL
YRGEGWTRKDIEEFHHHGHETFSIAQLFEAKEAGIEEWELEMMRLASQRKKKMKGEGTQD
WKAMLQTTVENMDWEEFEEDAKQIVTEVNEVVEDEPIAMQVDDMELTCDVNEKLKAAGAD
VLALLPSMPEIPQLLRLLSGQSELTQVAKVSGARVSLDAGS----DRFVPGQLVASEEGE
LFVPGQTVLTEAGEKEYTPGFTVMMDGEPTLIPGLVMGNDPNKAMFLPGESTITGGGELQ
FAASADDVLVNEPLPPP-VEEPEEAELDEDQNSVEEEIEMRPPPKRERKEFVYERPKRQY
DVESMGPKHRERGPKRLPAALQAAANEPPP--APKP-FVPVKMIEFTPPVFEKDLLEQEK
ERVAAMKEKTGKEEAKVDKTRREIRMRAKNLMDSRPPPPKYEPLEPVRKSEKLREMERSI
KQGAFFDTDYKKYLVRERNSWPVNWLEKYQYRNTFDTVGIQRHRVWKSVF----
### OG0010273.maffted.trimed.faの中身

>XP_050292831.1
MPFNVMIQLCNITRQSKLFQRNFYLPLLKPLSPKDALRIHVITGKDVLSKALEHWIPVLE
EYSARVARKRHMPKGKGKKRLRKKESMKYLMPFFDSGMDNHAKLENVCNRKSKGRGQCYT
YYIPEKKCTLVFTHQMERAIRDKDIIDVIIAQRHEKILVKMRDGTKVVMPLSPYNGKGPL
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATASNMDWDAFEEETKQIVEQANEPVNDEPIAMQLDNIDKTCDVIDKLKAAGEE
VLSLLPTMPEIPEILKVIKDQSELTEIANISGARVNLKASAERFVPGQMVTSEEGDLFVP
GQTILNESGVKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITESGELQFAAT
DEDILPHVDTPPPEKEVEEIELEEEQNSEDEEIEQRPPPKPKRKELTYERPKREFKTENM
GPKRRVRGPKKVAEPAPAPAPPTLERRPTIIEAKLFNLQTPTFEKDILEQQKERVEAFKE
KTGKEEARLNKYRLELRMKAKKMRESLPPPPIYEPLEPVRKSEKLRELEKSIKKGRFFEA
DHKKYITNEYTEKFHWIDTYQYKKVFDTVGIMRHRVWKPVY
>CAG9762300.1
MPFNVMIQLCNITRQS----------------------------KDVLAKALEHWIPVLE
DHASRVARRRHMPKGKGKKRQRKKESMRYLTPFFDEMMDNHGRLENMIIRKPKGRGQCYT
YYIPEKKCTLVLTHQFEKAIRDKDIVDIVIAQRHEKIIVKLRDGLKVVMPLSPYSGKAPF
YRGEGWTKKDIEEFHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATTENMDWDAFEEESRQIVEEANEPVEDDPIAMQLNDIEKTCDVTEKLKAAGED
VMSLLPKMPEIPEILKIFKDHSELTEIANVSGARVNLATGTHRFVPGQMVTSEDGEIFVP
GQTVLTESGEREYTPGFTVLLEDEPTLIPGLVMGNDPEKTMFLPGEATITESGELQFGVN
EDDIVP--SLTPPEKEVEEIELEEEQNSEEEEIEQRPPPK--KKELTYERPKREFNQEKM
GPKHRVRGPKKLPPVVSPPEKDLADRRKTVPDTKLFDLTTQTFEKDFLEQEKERVEAFKE
KTGKEEAKVDKQRREIKLMVKKMRDSLPPKPKYQPLEPVRKSEKLRDMEKSIKKGKFFEV
DYKKWLTKENNHPFHWMDTYQYKKTFDSVGIIRHRIWKSVY
>XP_019758826.1
MPFNVMIQLCNITRQSKLFQRNFFLPLLKPLSPKDALRIHIITGKDVLVKAIEHWIPVLE
EYACRVQRRRHMPKGKGKKRQRKKESMKYVMPFFDDMMENHPKLENLIIRKPKGRGQCYT
YYIPEKKCTLVLTYQMEKAIRDKDIVDIVIAQRHEKILVKMRDGTKVVMPLCPYNGHAPF
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDSRD
WKDMLNSTVNNMDWDAFEEESKQIVAESNDPVDDEPISMQLDDIEKTCDVNEKLKAAGDE
IMSLLPTMPEIPQILKVMKDQSEFTEIANISGARVSLSAGSDRFVPGQMVTSEDGELFVP
GQTVVNESGDKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITEAGELQFAAT
EDDIIPHPPTPPPEKEVEEVELEEDQNSEEEEVEQRPPPKRERKELTYERPKREFNTENM
GPKHRVRGPKKVAPVV-IKTEETPDRRKTIIDAKIFDLQTPTFEKDFLEQEKERVEAFKE
KSGKEEAKVDKQRREIKLKVKKLVDSRPPPPKYEPLEPVRKSEKLREFEKSIKRGKFFDV
DYKKYLTKEYTGQFHWLDTYQYRNTFDTVGIMRHRIWKSVY
>g9568.t1
MPFNVMIQLCSITRQSKLFQRNFYLPLLKPLSPKDALRIHIITGKDVLAKALEHWIPVLE
EYASRVARRRHMPKGRGKKRQRKKDSMKYVCPFFDDNMDNHGRLENMVNRKSKGRGQCYT
YYIPEKKCTLVVTHQMERAIRDKDIVDIVIAQRQEKILVKMKDGTKVVMPLCPYNGRNPL
YRGEGWTKKEIEEYHHHGRDTFSIAQLFEAKEAGIEEWELEMMRMANKRKKKIKGEDTKD
WKEMLSTTMQNMDWDAFEEESKQIVEEANEPVDDEPIPMQLNDMDKTCDVIEKLKAAGDD
VLKLLPVMPEIPEILKVIQDQSEFTEIASISGARVSLTSGSERFVPGQMVTSEEGELFVP
GQTTVSESGEKEYTPGFTVLLDNEPTLIPGLVMGNDPEKSMFLPGESTITESGELQFAAT
EDDILPYQPAPPSEKEVEEVELEEEQNSEEEEIEQRPPPKREKKEFTYERPKREFNPESM
GPKHRVRGPKKVPPMVQAPAEPTPDRRKTVVEVKIFDLQTPTFEKDFLEQEKERVEAFKE
KSGKEEAKVDKQRREIKMMAKKIIDSSPRVVKYEPLEPVKKSEKLREFEKSIKKGNFFDV
DYKKWLSRNHKEQFHWADTYQYRNTFDTVGIMRHRVWKSVY
>XP_030763397.1
MPFNVMIQLCRVTRASKLFQRNFYLPLLKPLSPKDALRRHIITGKDVLQKALEHWIPVLE
EYAARVQRRRHAPKGRGKKRQRRKESMKYVMPFFDDTLPSHPKLENLVARKSKGRGQCYT
YHIPEKKCTLVLTHQMERAIRDKDIVDIVIAQRHEKIIVKMRDGTKVTMPLCPYEGRAPL
YRGEGWTRKDIEEFHHHGHETFSIAQLFEAKEAGIEEWELEMMRLASQRKKKMKGEGTQD
WKAMLQTTVENMDWEEFEEDAKQIVTEVNEVVEDEPIAMQVDDMELTCDVNEKLKAAGAD
VLALLPSMPEIPQLLRLLSGQSELTQVAKVSGARVSLDAGSDRFVPGQLVASEEGELFVP
GQTVLTEAGEKEYTPGFTVMMDGEPTLIPGLVMGNDPNKAMFLPGESTITGGGELQFAAS
ADDVLVNEPLPPPVEEPEEAELDEDQNSVEEEIEMRPPPKRERKEFVYERPKRQYDVESM
GPKHRERGPKRLPAALQAAANEPPPAPKPFVPVKMIEFTPPVFEKDLLEQEKERVAAMKE
KTGKEEAKVDKTRREIRMRAKNLMDSRPPPPKYEPLEPVRKSEKLREMERSIKQGAFFDT
DYKKYLVRERNSPVNWLEKYQYRNTFDTVGIQRHRVWKSVF

Manualphylo_3.pyの実行

(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ python3 ManualPhylo_3.py OG_list2.txt species_list.txt

Manualphylo_3.pyを実行すると、*.maffted.trimed.edit.faができる。

対策

  • *.maffted.trimed.faのヘッダー行を遺伝子IDにせず、元のヘッダーを保持させる
    • align.shのコード内で-keepheaderオプションを追加し、ヘッダー行を保持させる。これによりヘッダー行に種名の情報を残す。
  • Manualphylo_3.pyspecies_list.txtに頼らないよう書き替え
    • Manualphylo_3.pyで行っているヘッダー行の書き換えについて、種名を元のヘッダーから取得するように変更。

変更後のalign.shはこちら

#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
    mafft --auto $x > $x.maffted.fa
    trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done

新規スクリプトNew_Manualphylo_3.shはこちら

import os
from Bio import SeqIO

# 処理するディレクトリのパス
input_dir = '~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/'
input_dir = os.path.expanduser(input_dir)

# ディレクトリ内のファイルをリスト
files = [f for f in os.listdir(input_dir) if f.endswith('.maffted.trimed.fa')]

# 各ファイルに対して処理を適用
for file in files:
    input_file = os.path.join(input_dir, file)
    
    # 出力ファイルのパス
    og_number = file.split('.')[0]  # OG番号を取得
    output_file = os.path.join(input_dir, f"{og_number}.maffted.trimed.edit.fa")
    
    # ファイルの読み込みと書き換え
    with open(output_file, 'w') as outfile:
        for record in SeqIO.parse(input_file, 'fasta'):
            header = record.description
            seq = str(record.seq)

            # ヘッダーが「>g」で始まる場合、「>Smad」に置き換え
            if header.startswith("g"):
                new_header = ">Smad"

            # ヘッダーが「]」で終わる場合、指定の形式に変換
            elif header.endswith("]"):
                # 「[]」内の最初の1文字とスペース後の3文字を抽出
                within_brackets = header.split('[')[1].split(']')[0]
                first_letter = within_brackets[0]
                space_after = within_brackets.split()[-1][:3]
                new_header = f">{first_letter}{space_after}"

            else:
                new_header = header

            # 新しいヘッダーと配列を出力ファイルに書き込む
            outfile.write(f"{new_header}\n{seq}\n")

        # ジョブの進行状況を出力
        print(f"Processed: {file}")

print("全てのファイルが処理されました。")

これを行って*maffted.trimed.edit.faを新しく作り直したのち、makealltree.shを改めてジョブとして投げた。

Checkpoint (OG0008033.ckp.gz) indicates that a previous run successfully finished
Use -redo option if you really want to redo the analysis and overwrite all output files.
Use --redo-tree option if you want to restore ModelFinder and only redo tree search.
Use --undo option if you want to continue previous run when changing/adding options.

ジョブを投げたら「それもうすでにやってるけど上書きしていい?」って警告が出た。上書きしていいように-redoのオプションをつけて再度実行。

0912

ショウジョウバエを含めたASTRAL出力の系統樹の描画

そういえばOrthofinderのみでこっちはしてなかったのでここに記載

tree6 = read.tree("/Users/kosukesano/bio/240912_ASTRAL.tre")

p=ggtree(tree6)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

tree7= read.tree("/Users/kosukesano/bio/240912_ASTRAL_Optimal_tree.tre")
p=ggtree(tree7)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

ショウジョウバエがあり得ない位置にいる!?

OG番号から配列を取ってくる際に種名をつけるところをミスっているのでは?昨日のNew_Manualphylo_3.pyをこれにも適用してもう一度やってみる。

### align.shの中身

#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
    mafft --auto $x > $x.maffted.fa
    trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done
###New_Maunalphylo_3.pyの中身

import os
from Bio import SeqIO

# 処理するディレクトリのパス
input_dir = '~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data'
input_dir = os.path.expanduser(input_dir)

# ディレクトリ内のファイルをリスト
files = [f for f in os.listdir(input_dir) if f.endswith('.maffted.trimed.fa')]

# 各ファイルに対して処理を適用
for file in files:
    input_file = os.path.join(input_dir, file)
    
    # 出力ファイルのパス
    og_number = file.split('.')[0]  # OG番号を取得
    output_file = os.path.join(input_dir, f"{og_number}.maffted.trimed.edit.fa")
    
    # ファイルの読み込みと書き換え
    with open(output_file, 'w') as outfile:
        for record in SeqIO.parse(input_file, 'fasta'):
            header = record.description
            seq = str(record.seq)

            # ヘッダーが「>g」で始まる場合、「>Smad」に置き換え
            if header.startswith("g"):
                new_header = ">Smad"

            # ヘッダーが「]」で終わる場合、指定の形式に変換
            elif header.endswith("]"):
                # 「[]」内の最初の1文字とスペース後の3文字を抽出
                within_brackets = header.split('[')[1].split(']')[0]
                first_letter = within_brackets[0]
                space_after = within_brackets.split()[-1][:3]
                new_header = f">{first_letter}{space_after}"

            else:
                new_header = header

            # 新しいヘッダーと配列を出力ファイルに書き込む
            outfile.write(f"{new_header}\n{seq}\n")

        # ジョブの進行状況を出力
        print(f"Processed: {file}")

print("全てのファイルが処理されました。")

これをどれも実行した上で、改めてmakealltree.shqsubで投げた。

0917

New_Manualphylo_3.pyを使用した昆虫6種のASTRAL

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data下でNew_Manualphylo_3.pyを実行し、その上でIQ-TREEにかけた結果が出てきた。

この最終出力であるall_trees.nwk~/tools/for_ASTRAL/Astral/data/new_6sp_withOneZero/にコピー。

注意!

元のヘッダーで種名以外の部分に[]が使われており、uubiという種が新たに創造されていた。2つ3つだったのでnanoで手動編集し修正した。

このファイルをmodify.pyにかけ、OG番号を削除。ASTRALにかける。

出力ファイルは~/tools/for_ASTRAL/Astral/240917_6sp_withOneZero_resultに格納。その中身は以下の通り。

### ~/tools/for_ASTRAL/Astral/240917_6sp_withOneZero_result/out.logの中身の一部

======== Running the main analysis
Number of taxa: 6 (6 species)
Taxa: [Agra, Cass, Dpon, Smad, Sory, Tcas]
Taxon occupancy: {Cass=1837, Sory=1902, Tcas=1856, Agra=1921, Smad=1843, Dpon=1909}
Number of gene trees: 1950
432 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Will attempt to complete bipartitions from X before adding using a distance matrix.
Building set of clusters (X) from gene trees 

各種に欠損がちびちび入ってるから無事取れてそう?

これにより描かれた系統樹は以下の通り。

tree8= read.tree("/Users/kosukesano/bio/240917_6sp_withOneZero_ASTRAL.tre")
p=ggtree(tree8)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p=flip(p, 2, 3)
p=flip(p, 1, 8)
p

Agraが変なとこにいるのは変わらない.

New_Manualphylo_3.pyを使用した昆虫7種のASTRAL

ショウジョウバエも入れた時の結果。

all_trees.nwk~/tools/for_ASTRAL/Astral/data/new_7sp/にコピー。

注意!

これも同様に元のヘッダーで種名以外の部分に[]が使われており、uubiという種が新たに創造されていた。また__という種もあった。2つ3つだったのでnanoで手動編集し修正した。

このファイルをmodify.pyにかけ、OG番号を削除。ASTRALにかける。出力ファイルは~/tools/for_ASTRAL/Astral/240917_7sp_resultに格納。その中身は以下の通り。

### ~/tools/for_ASTRAL/Astral/240917_7sp_result/out.logの中身の一部

======== Running the main analysis
Number of taxa: 7 (7 species)
Taxa: [Agra, Cass, Dpon, Dmel, Tcas, Sory, Smad]
Taxon occupancy: {Dmel=630, Cass=630, Sory=630, Tcas=630, Agra=630, Smad=630, Dpon=630}
Number of gene trees: 630
0 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Building set of clusters (X) from gene trees 
------------------------------

これにより描かれた系統樹は以下の通り。

tree9= read.tree("/Users/kosukesano/bio/240917_7sp_ASTRAL.tre")
p=ggtree(tree9)+
  xlim(0, 7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p=flip(p, 2, 11)
p=flip(p, 1, 9)
p

SmadCassDponのクレード関係は先行研究と違ってるね。Agraはそもそも元データがダメか?

6種の昆虫のミトコンドリアCO1遺伝子での系統樹推定

~/tools/for_orthofinder/CO1_6sp/dataディレクトリを作成。

その下でSmad.fastaを作成した。参照元はこちら

Cass.fasta参照元はこちら

Dpon.fasta参照元はこちら

Agra.fasta参照元はこちら

Sory.fasta参照元はこちら

Tcas.fasta参照元はこちら

これを1つにまとめたファイル、COI.fasta~/tools/for_orthofinder/CO1_6spに作成した。

これについて、align.shを実行した。align.shのスクリプトは以下の通り。

### align.shの中身

#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH

# COI.fasta に対してMAFFTとTrimAlを実行
mafft --auto COI.fasta > COI.maffted.fa
trimal -in COI.maffted.fa -out COI.maffted.trimed.fa -htmlout COI.maffted.trimed.fa.html -automated1

さらに、これについてIQ-TREEで遺伝子系統樹を描いた。実行スクリプトIQ_TREE.shは以下の通り。

### IQ_TREE.shの中身

#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"

# 処理するファイルを指定
file="COI.maffted.trimed.fa"

# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.fa)

# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}

# 作成された系統樹ファイル (.treefile) を確認
if [ -f ${base_name}.treefile ]; then
    echo "Tree for ${base_name} has been successfully created."
else
    echo "Error: ${base_name}.treefile not found" >&2
fi

echo "Process completed."

date

これをqsubで投げた。

元データのヘッダーを書き換えた状態での系統樹推定

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dirディレクトリから.fastaファイルを取得し、「種名」+「遺伝子ID」のみに書き換えて~/tools/for_orthofinder/RemakeHedder_6spディレクトリに保存するスクリプトedit.py~/tools/for_orthofinder/RemakeHedder_6spの下で作成。

### edit.pyの中身

import os
from Bio import SeqIO

# 入力ディレクトリと出力ディレクトリのパス
input_dir = '../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/'
output_dir = '../RemakeHedder_6sp/'

# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 入力ディレクトリ内のすべての .fasta ファイルを処理
for input_file in os.listdir(input_dir):
    if input_file.endswith('.fasta'):
        input_path = os.path.join(input_dir, input_file)
        output_path = os.path.join(output_dir, input_file)

        # 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
        with open(output_path, 'w') as outfile:
            for record in SeqIO.parse(input_path, 'fasta'):
                header = record.description
                seq = str(record.seq)

                # ヘッダーが「g」で始まる場合
                if header.startswith("g"):
                    # 新しいヘッダーは「>Smad」 + 「元のヘッダーの番号」
                    number = header.split()[0]  # ヘッダーの最初の番号部分を取得
                    new_header = f">Smad_{number}"

                # ヘッダーが「]」で終わる場合
                elif header.endswith("]"):
                    # ヘッダーの最後の「[]」内の英字を抽出
                    within_brackets = header.split('[')[-1].split(']')[0]
                    first_letter = within_brackets[0]  # 最初の1文字
                    space_after = within_brackets.split()[-1][:3]  # スペース後の3文字
                    
                    # 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
                    first_part = header.split()[0][1:]
                    new_header = f">{first_letter}{space_after}_{first_part}"

                else:
                    new_header = f">{header.split()[0]}"

                # 新しいヘッダーと配列を出力ファイルに書き込む
                outfile.write(f"{new_header}\n{seq}\n")

        print(f"{output_path} に保存しました。")

これを実行。

(MPT) kosukesano@at137:~/tools/for_orthofinder/RemakeHedder_6sp$ python edit.py 
../RemakeHedder_6sp/Tcas.fasta に保存しました。
../RemakeHedder_6sp/Agra.fasta に保存しました。
../RemakeHedder_6sp/Smad.fasta に保存しました。
../RemakeHedder_6sp/Cass.fasta に保存しました。
../RemakeHedder_6sp/Dpon.fasta に保存しました。
../RemakeHedder_6sp/Sory.fasta に保存しました。
(MPT) kosukesano@at137:~/tools/for_orthofinder/RemakeHedder_6sp$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Smad.fasta  Sory.fasta  Tcas.fasta  edit.py

~/tools/for_orthofinderOrthofinderの実行スクリプトOrthofinder_240917_RH.shを記述。

### Orthofinder_240917_RH.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder -f ~/tools/for_orthofinder/RemakeHedder_6sp -t 5 -a 5

date

これをqsubで投げた。

Pissodes strobiのソフトマスク

参照元はこちら

生データは~/tools/for_softmask/nama_data/Pstr_dataに格納した。

(MPT) kosukesano@at137:~/tools/for_softmask/nama_data/Pstr_data$ ls
GCA_016904865.1  assembly_data_report.jsonl  data_summary.tsv  dataset_catalog.json
(MPT) kosukesano@at137:~/tools/for_softmask/nama_data/Pstr_data$ 

ソフトマスク用のディレクトリ~/tools/for_softmask/Pstr_softmaskを作成。

(MPT) kosukesano@at137:~/tools/for_softmask$ mkdir Pstr_softmask
(MPT) kosukesano@at137:~/tools/for_softmask$ cd Pstr_softmask/
(MPT) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ ls
(MPT) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ source ~/tools/pyenv_env/EDTA_profile

データベースの構築

(MPT) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ source ~/tools/pyenv_env/EDTA_profile

(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ BuildDatabase -name Pstr_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna
Building database Pstr_BLAST_DATABASE_PREFIX:
  Reading /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna...
Number of sequences (bp) added to database: 84140 ( 2025024129 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ 

RepeatModelerの実行

Pstr_RepeatModeler.shを作成し、qsubで投げた。シェルスクリプトの中身は以下の通り。

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Pstr_BLAST_DATABASE_PREFIX  -pa 6
date

#### Elaeidobius kamerunicusのソフトマスク

参照元はこちら

生データは~/tools/for_softmask/nama_data/Ekam_dataに格納した。

ソフトマスク用のディレクトリ~/tools/for_softmask/Ekam_softmaskを作成。

データベースの構築

(EDTA2) kosukesano@at137:~/tools/for_softmask/Ekam_softmask$ BuildDatabase -name Ekam_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
Building database Ekam_BLAST_DATABASE_PREFIX:
  Reading /home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna...
Number of sequences (bp) added to database: 364527 ( 269635327 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ekam_softmask$ 

RepeatModelerの実行

Ekam_RepeatModeler.shを作成し、qsubで投げた。シェルスクリプトの中身は以下の通り。

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Ekam_BLAST_DATABASE_PREFIX  -pa 6
date

0918

6種の昆虫ゲノムについて、all_seq.faのヘッダー行を書き換えて再度系統樹作成

OG番号から種ごとに配列を取ってくる際、何かやらかしているのでは?そこのケア。

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/RenameHedderというディレクトリを作り、そこで作業を行う。

まず新しいOG_list.txtを作る。前のOG_list.txtは種名がわからなかったので種名もつけるようにする。それを実行するスクリプトはmakeOGlist.py

### makeOGlist.pyの中身

# ファイルを読み込む
input_file = '../ManualPhylo_data/OG_list.txt'
output_file = '../RenameHedder/New_OG_list.txt'

# 種の接頭辞をリストで定義
prefixes = ['Agra_', 'Cass_', 'Dpon_', 'Smad_', 'Sory_', 'Tcas_']

# 出力ファイルに書き込み
with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
    for line in fin:
        parts = line.strip().split()  # 各行をスペースで区切る
        new_line = parts[0] + ' ' + ' '.join([f'{prefixes[i]}{parts[i+1]}' for i in range(6)])  # プレフィックスを追加
        fout.write(new_line + '\n')  # 新しい行をファイルに書き込む

最終出力はNew_OG_list.txt

次に新しいallseq.faを作る。こちらはヘッダーにいらないアノテーションの説明などが入っているので、遺伝子IDと種名のみにする。その実行スクリプトedit_allseq.pyは以下の通り。

### edit_allseq.pyの中身

  GNU nano 6.2                                                                         edit_allseq.py                                                                                  
from Bio import SeqIO

# 入力ファイルと出力ファイルのパス
input_file = '../../../../make_philo_tree/output_directory/all_seq.fa'
output_file = 'new_all_seq.fa'

# 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
with open(output_file, 'w') as outfile:
    for record in SeqIO.parse(input_file, 'fasta'):
        header = record.description
        seq = str(record.seq)

        # ヘッダーが「>g」で始まる場合
        if header.startswith("g"):
            # 新しいヘッダーは「>Smad」 + 「元のヘッダーの番号」
            number = header.split()[0]  # ヘッダーの最初の番号部分を取得
            new_header = f">Smad_{number}"

        # ヘッダーが「]」で終わる場合
        elif header.endswith("]"):
            # ヘッダーの最後の「[]」内の英字を抽出
            within_brackets = header.split('[')[-1].split(']')[0]
            first_letter = within_brackets[0]  # 最初の1文字
            space_after = within_brackets.split()[-1][:3]  # スペース後の3文字

            # 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
            first_part = header.split()[0][0:]
            new_header = f">{first_letter}{space_after}_{first_part}"

        else:
            new_header = f">{header.split()[0]}"

        # 新しいヘッダーと配列を出力ファイルに書き込む
        outfile.write(f"{new_header}\n{seq}\n")

print(f"{output_file} に保存しました。")

これの最終出力はnew_all_seq.fa

続いて各OGごとに.fastaファイルを作る。その実行スクリプトNew_Manualphylo_2.pyは以下の通り。

### New_Manualphylo_2.pyの中身

# ライブラリのインポート
import os

# ファイルのパス
og_list_file = "New_OG_list.txt"
sequence_file = "new_all_seq.fa"

# sequence_fileをメモリに読み込む
with open(sequence_file, "r") as seq_file:
    sequences = seq_file.read().splitlines()

# ヘッダーとシーケンスのマッピング
seq_dict = {}
header = ""
for line in sequences:
    if line.startswith(">"):
        header = line[1:].strip()  # ヘッダー行('>'を除く)
        seq_dict[header] = ""
    else:
        seq_dict[header] += line.strip()

# New_OG_list.txtを処理
with open(og_list_file, "r") as og_file:
    og_lines = og_file.readlines()

# 1行ごとに処理を行う
for og_line in og_lines:
    og_data = og_line.strip().split()  # 空白で区切る
    og_number = og_data[0]  # OG番号
    gene_ids = og_data[1:]  # 遺伝子IDのリスト

    # 出力ファイルを作成
    output_file = f"{og_number}.fa"
    with open(output_file, "w") as out_file:
        # 各遺伝子IDについて処理
        for gene_id in gene_ids:
            if gene_id in seq_dict:
                # 一致したヘッダーと配列データを書き込む
                out_file.write(f">{gene_id}\n")
                out_file.write(f"{seq_dict[gene_id]}\n")
    
    # 経過報告
    print(f"{og_number}.fa の作成が完了しました。")

print("すべての処理が完了しました。")

続いて各OGのファイルにMAFFTをかける。そのスクリプトalign.shは以下の通り。

### align.shの中身

#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
    mafft --auto $x.fa > $x.maffted.fa
    trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -htmlout $x.maffted.trimed.fa.html -automated1
done

MAFFTによって得られたファイルをIQ-TREEの入力に沿うよう、遺伝子IDの部分を切る。そのスクリプトManualphylo_4.pyは以下の通り。

### Manualphylo_4.pyの中身

import os

# ファイルのヘッダーを変更する関数
def modify_headers(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if line.startswith(">"):
                # ヘッダー行の最初の四文字を抽出して書き換え
                outfile.write(f">{line[1:5]}\n")
            else:
                outfile.write(line)

# 作業ディレクトリ内のすべての ".maffted.trimed.fa" ファイルに対して処理を適用
def process_directory(directory):
    for filename in os.listdir(directory):
        if filename.endswith(".maffted.trimed.fa"):
            input_file = os.path.join(directory, filename)
            output_file = os.path.join(directory, filename.replace(".maffted.trimed.fa", ".maffted.trimed.edit.fa"))
            modify_headers(input_file, output_file)
            print(f"Processed: {filename}")

# 実行するディレクトリを指定(例:カレントディレクトリ)
process_directory(".")

これを実行した後にmakealltree.shを作成、ジョブとして投げた。

ミトコンドリア系統樹続き

ジョブが帰ってきたので中身を見てみる。

###  ~/tools/for_orthofinder/CO1_6sp/240918_test/COI.treefile
(Agra:0.0848824451,Cass:0.1343636812,((Dpon:0.0000020989,Tcas:8.4017171493)19:0.1500110432,(Smad:0.1408388478,Sory:0.1744199629)36:0.0439910356)25:0.0494856164);

系統樹は以下の通り

tree10= read.tree("/Users/kosukesano/bio/240917_CO1.tre")
p=ggtree(tree10,  branch.length = 'none')+
  xlim(0, 5)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

なんだこれ……。

トリミングした後のファイルを見てみる。

### COI.maffted.trimed.faの中身

>Agra
------------------------------------------------------------
----------tttaattttaagaagaattgtagaaaaggagctggaacaggatgaacagt
ttaccctccactttcttctaatttagctcatgaaggacttctgttgatttagctattttt
agccttcatatagccgggatttcttcaattctcggagctataaattttatttcaacagta
aatatacctcaagaagtagagcaaatacctttattttgagctgtaaaaattacagctatc
ttattactaatttctcttccagtcttagcaggggctatta-ctatactactaactgaccg
taatattaatacatcattttttgatccgcaggtggtggagacccaattctctatcaacac
ttatttcccagaagtttatattctaat-------
>Cass
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------tccagaagtatatattttaataaaaaaa
>Dpon
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------gtatcctctatccttggagctatcaattttatttctacaata
aatatacttcaggaattagatcgtttaacgttattttgagcagtaaaaattacagctatc
ttattattgttatcattaccagtattagctggagccatca-ctatacttttaacagaccg
aaatatcaatactactttttttgantcyccggtggaggagatcctattctctatcaacac
ttatttccccgaagtttacattttaataaaaaag
>Smad
ataatacttcattaaggaaattttattctattgtaacagcatcttttttatgtaatacct
gtttaattcttcttattttaagaagaattatgataaaggagcaggaacaggctgaacagt
ttatccccctttatctacaaatattgctcatgaaggtcatctgtagatctagctatcttt
agactacatatagcagggatctcttcaatcctaggagcaataaattttatttcaacaatt
aatatactataggaatttgatcaattatcattattttgagcagttaaattaacagcaatt
ctacttttattatcattacctgttttagctggagctatca-ctatattattaactgatcg
aaatattaatacttcattttttgatccgcaggagggggagaccctatcttatatcaacat
ttattt----------------------------
>Sory
atagtacatccttaaggaaattttatactattgtcacagcattttctttatgtaatacca
atttaactcttttactaataagaagatttatgaaaagggagcaggaacaggatgaaccgt
ctaccccccgctctcatccaatattgcccatgaaggacttctgttgatctggccattttc
agtttacatatagcaggaatttcatctattctaggagctattaattttattacaacagat
aatatacctcaggaatctgaacgaataaccctattttgagcagtaagaatcactgctatt
ctcctcctctttagattacctgtattagcaggagcaatca-ctatacttcttactgatcg
aaatattaatacttccttttttgatccgccggaggag-----------------------
----------------------------------
>Tcas
aaaatacgccattacggcaaggttatacacttgacccagtttttccattaggaaatacca
gtttaaattttggaaattttaacaaaaccgtaaaatgaacaacgcacggcaatccttggc
tt-tggggcgcctccttgtgcgagaattcagcaaaacgcccccaggggcacggaacagcc
gggccccaggtttcacaagttgaaggagatccagaag-----aaatttggggttgaagac
gttccaacttgaaaggtcggcgacaaaattctctacttgactgtattgatcacagctg--
------------------caggtctcggcatgtccattgagtcgtggtaccgattggcca
ataaataaacataacttgttctgtagttactatagttaag-taaattccaacaataaaat
ttattt---------------------aaacaaa

Cassがスカスカすぎる!

7種昆虫でのヘッダー行再構築

上で書いた6種のやつからmakeOGlist.pyedit_allseq.pyNew_Manualphylo_2.pyalign.shManualphylo_4.pyをコピーしてきて実行。

makealltree.shseven_makealltree.shとしてコピー(同じ名前のジョブが並んでややこしくならないように。)

0919

7種ゲノムでヘッダー修正後の系統樹作成

結果は~/tools/for_ASTRAL/Astral/240919_7spに保存した。

tree10= read.tree("/Users/kosukesano/bio/240919_7sp")
p=ggtree(tree10,  branch.length = 'none')+
  xlim(0,7)+
  theme(text = element_text(face = "italic"))+
  geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
  geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
  #geom_text(aes(label=node), hjust=-.2)+
  theme_tree()
p

結局変わらなかった

7種でのSuperMatrix法によるIQ-TREEを使った系統樹推定

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_datamakerun.pyを作成。

### makerun.pyの中身


import glob
import os

list = []
for i in glob.glob('*.maffted.trimed.edit.fa'):
        list.append(os.path.split(i)[1].rstrip())

#print(list[0])


##ls | grep "maffted.trimed.edit.fa" > otamesi.txtで、完成したOGをotamesi.txtに一行ずつ保存
##ファイルの行数をカウント。このカウント数がfor文のrangeに入る数になる

f = open("run.nex", "w")
f.write("#nexus" + "\n")
f.write("begin sets;" + "\n")
character = "charset part"
for line, i in zip(list, range(4997)):
        row = character + str(i+1) + " = " + line + ": ;"
        f.write("\t" + row + "\n")
f.write("end;" + "\n")
f.close()

これでrun.nexを作る。

続いてIQ-TREEの実行。使ったシェルスクリプトはmanualphylo.sh

### manualphylo.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16

date
singularity exec -e /usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0 iqtree2 -sp run.nex -nt AUTO -bb 1000 -cptime 600
date

これをqsubで投げた。

7種での欠失を1つ許したASTRAL

~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/Orthogroups下でSCOwiith0tyusyutu.pyを作成、実行した。SCOwiith0tyusyutu.pyは0905の完全コピー。

続いて~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/SCOwithOneZero_Manualphylo_dataManualphylo_1.pyを実行した。Manualphylo_1.pyは以下の通り。

### Manualphylo_1.pyの中身

##analysis_manual.pptxの#46も参照

##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh

##時間は10secほど

import numpy as np
import pandas as pd
import os

path = "~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/"
withpath = "../"

OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")

##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups.GeneCount.SingleCopyWithOneZeroOrtholog.txt", "r")            as fin:
        for line in fin:
                li = line.rstrip()
                new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new) 
new.to_csv(path + "SCOwithOneZero_Manualphylo_data/OG_list.txt", sep = " ", index = False, header = False)

##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。

li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "SCOwithOneZero_Manualphylo_data/species_list.txt", "w") as file:
   for column_name in allspe2:
       file.write("%s\n" % column_name)

同じディレクトリでManualphylo_2.pyを実行した。結構時間かかる。

### Manualphylo_2.pyの中身

##ManualPhylo_1.pyの続き

import sys
from Bio import SeqIO

path = "../SCOwithOneZero_Manualphylo_data/"

fasta_in = sys.argv[1]                                  #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2]                                  #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する

for q in open(query_in, "r"):                                           #オーソログファイルを開いて1行づつ読み込む
        query = q.split()                                                       #スペース毎に切りとってリスト形式でqueryに保存する
        f = open(path + query[0], 'w')                                  #最初の列(OG名)と同じ名前のファイルを作成する
        for record in SeqIO.parse(fasta_in, 'fasta'):   #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
                id_part = record.id                                     #fastaのID部分を読み込む
                desc_part = record.description                  #fastaのdescription部分を読み込む
                seq = record.seq                                        #fastaの配列部分を読み込む
                for i in range(len(query)):                         #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
                        if id_part == query[i] or desc_part == query[i] :                   #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
                                fasta_seq = '>' + id_part + ' ' + desc_part + '\n' + seq + '\n'         #fasta形式に整え
                                print(fasta_seq)                                        #標準出力にfastaを出力(進行状況把握用)
                                f.write(str(fasta_seq))                             #各OGファイルにfastaを出力
        f.close()

##できたOGファイルは、align.shやOG_list.txtと同じ場所に
##align.shのある場所までいき、作動。cwdを231016/ManualPhylo_dataにしないとtrimalが作動せず、イライラ

実行のコマンドは以下の通り

python ManualPhylo_2.py ../make_philo_tree/all_seq.fa OG_list.txt

0930

Elaeidobius kamerunicusRepeatMasker

~/tools/for_softmask/Ekam_softmaskで以下のスクリプトを作成、実行した。

### Ekam_RepeatMasker.sh

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib\
        /home/kosukesano/tools/for_softmask/Ekam_softmask/RM_2331.MonSep230400512024\
        /home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna

date

Pissodes strobiRepeatMasker

~/tools/for_softmask/Pstr_softmaskで以下のスクリプトを作成、実行した。

### Pstr_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib\
        /home/kosukesano/tools/for_softmask/Pstr_softmask/RM_77685.SunSep222227102024/consensi.fa.classified\
        /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna

date

IQ-TREE出力の系統樹についてCAFEの前準備

:~/bio/for_cafe$ mkdir 0930_orthofinder_data
:~/bio/for_cafe$ cd 0930_orthofinder_data/
:~/bio/for_cafe/0930_orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups.GeneCount.tsv ../0930_orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.GeneCount.tsv                                                                                                                       100%  354KB   4.7MB/s   00:00    
:~/bio/for_cafe/0930_orthofinder_data$ ls
Orthogroups.GeneCount.tsv
:~/bio/for_cafe/0930_orthofinder_data$
Orthologs_raw <- 
  read_tsv(paste("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.GeneCount.tsv", sep = "/"))

##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
  select(!c(Orthogroup, Total)) %>%
  t()

##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>% 
  apply(2, max) %>%
  as.data.frame() %>%
  rename(max_real = ".")
saisyou <- Enzan %>% 
  apply(2, min) %>%
  as.data.frame() %>%
  rename(min_real = ".")

##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
  bind_cols(saidai, saisyou)

##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>% 
  mutate(sa = max_real - min_real) %>%
  filter(max_real != min_real) %>%
  filter(sa < 50)


##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>% 
  mutate(Description = Orthogroup, ID = Orthogroup) %>%
  relocate(Description, ID) %>%
  select(!c(Orthogroup, max_real, min_real, sa))

Orthologs_3 %>% 
  write_tsv(paste("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
##Did you finish creating ultrametric tree with makeultrametric.R?
### ~/bio/for_cafe/0930_orthofinder_data/IQTREE_6sp_out.txt
(Agra:0.2063145278,((Cass:0.1854832267,Dpon:0.2312244082)97:0.0164557860,Smad:0.1400667500)100:0.0195008540,(Sory:0.2105736237,Tcas:0.5169602661)100:0.0679653321);
tree = read.tree("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/IQTREE_6sp_out.txt")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
  tree,
  100000,
  age.min = 152.3,  # 推定分岐年代の最小値(MYA)
  age.max = 236.2,  # 推定分岐年代の最大値(MYA)
  node = mrca,   # getMRCAで指定したノード
  S = 1,
  tol = 1e-20,
  CV = FALSE,
  eval.max = 500,
  iter.max = 500
)
is.ultrametric(tree2)  # ultrametricかどうか確認
write.tree(tree2, file = "0930_orthofinder_data/tree_IQTREE_ultrametric.nwk")  # ultrametric系統樹の保存
:~/bio/for_cafe/0930_orthofinder_data$ scp Orthogroups.GeneCount2.tsv kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useIQTREE
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.GeneCount2.tsv                                                                                                                      100%  400KB   5.9MB/s   00:00    
:~/bio/for_cafe/0930_orthofinder_data$ scp tree_IQTREE_ultrametric.nwk kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useIQTREE
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
tree_IQTREE_ultrametric.nwk                                                                                                                     100%  143    14.8KB/s   00:00    
:~/bio/for_cafe/0930_orthofinder_data$

IQ-TREE出力の系統樹を使ったCAFE5の実行

kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_IQTREE_ultrametric.nwk 

Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_IQTREE_ultrametric.nwk 

Filtering families not present at the root from: 12784 to 8360

No root family size distribution specified, using uniform distribution

Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
.
.
.
.
.
59 values were attempted (0% rejected)

Inferring processes for Base model
Score (-lnL): 115874.33817573
Maximum possible lambda for this topology: 0.0020213044143122
Computing pvalues...
done!

Starting reconstruction processes for Base model
Done!

kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE$ ls
Orthogroups.GeneCount2.tsv  results  tree_IQTREE_ultrametric.nwk
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE$ cd results/
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE/results$ ls
Base_asr.tre  Base_branch_probabilities.tab  Base_change.tab  Base_clade_results.txt  Base_count.tab  Base_family_likelihoods.txt  Base_family_results.txt  Base_results.txt
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE/results$ 
### ~/tools/for_cafe/6sp_useIQTREE/results/Base_clade_results.txt 

#Taxon_ID       Increase        Decrease
Agra<8> 2793    925
<7>     1       53
<5>     1       12
Cass<1> 499     3639
Dpon<0> 2982    1374
Smad<4> 886     3180
<6>     861     573
Sory<3> 2191    1232
Tcas<2> 1853    1984

ASTRALの再実行

6種のゲノム全てを使ったASTRALを改めて行った。結果は~/tools/for_ASTRAL/Astral/240930_6spに格納。使ったスクリプトは以下の通り。

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date

java -Xmx2G -jar astral.5.7.8.jar \
    -i /home/kosukesano/tools/for_ASTRAL/Astral/data/240930_6sp/modified_trees.nwk \
    -o /home/kosukesano/tools/for_ASTRAL/Astral/240930_6sp/out.tre \
    2>/home/kosukesano/tools/for_ASTRAL/Astral/240930_6sp/out.log

date

この出力は末端枝の長さが不明なので、暫定的に末端枝を1とするスクリプトを用意し実行した。スクリプトのソースはこちら

### ~/tools/for_ASTRAL/Astral/240930_6sp/makelength.pyの中身

#!/usr/bin/env python
'''
Created on Jun 3, 2011

@author: smirarab
'''
import dendropy
import sys
import os
import copy
import os.path

if __name__ == '__main__':

    if len(sys.argv) < 3: 
        print("USAGE: [postfix|-|--] treefile")
        sys.exit(1)
    stdout = False
    if sys.argv[1] == "-":
        resultsFile = sys.stdout
        stdout = True
    elif sys.argv[1] == "--":
        postfix = "blen"
    else:
        postfix = sys.argv[1]
    
    c={}
    for treeName in sys.argv[2:]:
        if not stdout:
            resultsFile=open("%s.%s" % (treeName, postfix),'w')
        trees = dendropy.TreeList.get_from_path(treeName, 'newick')
        for tree in trees:
            for e in tree.postorder_edge_iter():
                if not e.length:
                    e.length = 1
        sys.stderr.write("writing results to " + resultsFile.name + "\n")        
        trees.write(file=resultsFile,schema='newick')

これを以下のコマンドで実行。


kosukesano@at138:~/tools/for_ASTRAL/Astral/240930_6sp$ python makelength.py blen out.tre 
writing results to out.tre.blen
kosukesano@at138:~/tools/for_ASTRAL/Astral/240930_6sp$

これで書きかわったファイルout.tre.blenができた。

このファイルをローカルで/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/ASTRAL_6sp.txtとして保存し、以下のコードを実行した。

tree = read.tree("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/ASTRAL_6sp.txt")
ASTRAL_6sp = root(tree, outgroup = "Tcas")
write.tree(phy=ASTRAL_6sp, file='0930_orthofinder_data/ASTRAL_6sp_after_root_outgroup.txt')
mrca = getMRCA(ASTRAL_6sp, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
  tree,
  100000,
  age.min = 152.3,  # 推定分岐年代の最小値(MYA)
  age.max = 236.2,  # 推定分岐年代の最大値(MYA)
  node = mrca,   # getMRCAで指定したノード
  S = 1,
  tol = 1e-20,
  CV = FALSE,
  eval.max = 500,
  iter.max = 500
)
is.ultrametric(tree2)  # ultrametricかどうか確認
write.tree(tree2, file = "0930_orthofinder_data/tree_ASTRAL_ultrametric.nwk")  # ultrametric系統樹の保存

これによってできた系統樹を~/tools/for_cafe/6sp_useASTRALにコピーした。

ASTRAL出力の系統樹を使ったCAFE5の実行

kosukesano@at138:~/tools/for_cafe/6sp_useASTRAL$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i ../6sp_useIQTREE/Orthogroups.GeneCount2.tsv -t tree_ASTRAL_u
ltrametric.nwk 

Command line: /usr/local/bin/cafe5 -i ../6sp_useIQTREE/Orthogroups.GeneCount2.tsv -t tree_ASTRAL_ultrametric.nwk 

Filtering families not present at the root from: 12784 to 8975

No root family size distribution specified, using uniform distribution

Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
.
.
.
.
.
Completed 24 iterations
Time: 0H 0M 3S
Best match is: 0.0023045008755977
Final -lnL: 126872.43351184

51 values were attempted (0% rejected)

Inferring processes for Base model
Score (-lnL): 126872.43351184
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!

Starting reconstruction processes for Base model
Done!

kosukesano@at138:~/tools/for_cafe/6sp_useASTRAL$ 
### ~/tools/for_cafe/6sp_useASTRAL/results/Base_clade_results.txt

#Taxon_ID       Increase        Decrease
Cass<0> 592     3547
Agra<9> 3187    914
<8>     27      1188
<7>     710     754
Sory<5> 2319    1113
Tcas<4> 1923    1958
<6>     1       194
Dpon<3> 3181    1428
Smad<1> 1013    3123

IQ-TREE出力の系統樹を使ったPAMLの実行

~/tools/for_paml/IQTREE_6sp/ディレクトリを作成、その下でdataディレクトリを作った。dataにはツリーファイルを格納した。

kosukesano@at138:~/tools/for_paml/IQTREE_6sp$ ls data
tree_ASTRAL_ultrametric.nwk  tree_IQTREE_ultrametric.nwk

また、~/tools/for_paml/IQTREE_6sp/bsAディレクトリを作成。その下でrun_paml.shtemplate.ctlを作った。

### run_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname"
bsA_dir="/home/kosukesano/tools/for_paml/IQTREE_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fasta; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fasta)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
###template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/IQTREE_6sp/data/tree_IQTREE_ultrametric.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

run_paml.shqsubで投げた。

続いて帰無仮説の方について、~/tools/for_paml/IQTREE_6sp/bs_nullディレクトリを作成。その下でbsN_run_paml.shbsN_template.ctlを作った。

ASTRAL出力の系統樹を使ったPAMLの実行

kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls
bsA  bs_null  data  rst  rst1  rub  run_paml.sh.e26903588  run_paml.sh.o26903588
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls bsA/
2NG.dN  2NG.dS  2NG.t  4fold.nuc  bsA.ctl  lnf  result  rst  rst1  rub  run_paml.sh  run_paml.sh.e26903683  run_paml.sh.o26903683  template.ctl
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls bs_null/
2NG.dN  2NG.t      bsA.ctl          bsN_run_paml.sh.e26903612  bsN_run_paml.sh.o26903612  bsN_template.ctl  result  rst1
2NG.dS  4fold.nuc  bsN_run_paml.sh  bsN_run_paml.sh.e26903688  bsN_run_paml.sh.o26903688  lnf               rst     rub
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ 

2024年10月

1004

scorpion内でpyenvを立ち上げる。

pyenvのインストール

(base) dendezia@scorpion:~$ git clone https://github.com/yyuu/pyenv.git ~/.pyenv
Cloning into '/home/dendezia/.pyenv'...
remote: Enumerating objects: 25118, done.
remote: Counting objects: 100% (1852/1852), done.
remote: Compressing objects: 100% (127/127), done.
remote: Total 25118 (delta 1772), reused 1732 (delta 1724), pack-reused 23266 (from 1)
Receiving objects: 100% (25118/25118), 5.09 MiB | 9.08 MiB/s, done.
Resolving deltas: 100% (16935/16935), done.
(base) dendezia@scorpion:~$ ls -a
.  ..  .R  .bash_history  .bash_logout  .bashrc  .cache  .conda  .dotnet  .lesshst  .profile  .pyenv  .ssh  .vscode-server  .wget-hsts  .zshrc  old_envilonment_until20241004
(base) dendezia@scorpion:~$ ls .pyenv/
CHANGELOG.md  CONDUCT.md       Dockerfile  MAINTENANCE.md  README.md  completions  man      pyenv.d  terminal_output.png
COMMANDS.md   CONTRIBUTING.md  LICENSE     Makefile        bin        libexec      plugins  src      test
(base) dendezia@scorpion:~$ 

pyenv用のプロファイルの作成

(base) dendezia@scorpion:~$ mkdir pyenv_conda_environment
(base) dendezia@scorpion:~$ cd pyenv_conda_environment/
(base) dendezia@scorpion:~/pyenv_conda_environment$ nano .pyenv_profile
(base) dendezia@scorpion:~/pyenv_conda_environment$ ls -a
.  ..  .pyenv_profile
(base) dendezia@scorpion:~/pyenv_conda_environment$ source .pyenv_profile 
(base) dendezia@scorpion:~/pyenv_conda_environment$ pyenv
pyenv 2.4.14-1-g468dc811
Usage: pyenv <command> [<args>]

Some useful pyenv commands are:
   --version   Display the version of pyenv
   commands    List all available pyenv commands
   exec        Run an executable with the selected Python version
   global      Set or show the global Python version(s)
   help        Display help for a command
   hooks       List hook scripts for a given pyenv command
   init        Configure the shell environment for pyenv
   install     Install a Python version using python-build
   latest      Print the latest installed or known version with the given prefix
   local       Set or show the local application-specific Python version(s)
   prefix      Display prefixes for Python versions
   rehash      Rehash pyenv shims (run this after installing executables)
   root        Display the root directory where versions and shims are kept
   shell       Set or show the shell-specific Python version
   shims       List existing pyenv shims
   uninstall   Uninstall Python versions
   version     Show the current Python version(s) and its origin
   version-file   Detect the file that sets the current pyenv version
   version-name   Show the current Python version
   version-origin   Explain how the current Python version is set
   versions    List all Python versions available to pyenv
   whence      List all Python versions that contain the given executable
   which       Display the full path to an executable

See `pyenv help <command>' for information on a specific command.
For full documentation, see: https://github.com/pyenv/pyenv#readme
(base) dendezia@scorpion:~/pyenv_conda_environment$ 

pyenvを用いたAnaconda3のインストール

### condainstall.shの中身


#$ -S /bin/bash
#$ -cwd
date

echo starting at date
source ~/pyenv_conda_environment/.pyenv_profile
pyenv install anaconda3-2020.11

date

これをqsubで投げた

結果

~/.pyenv/versions/anaconda3-2020.11ディレクトリができた。

(base) dendezia@scorpion:~/pyenv_conda_environment$ cd ~/.pyenv/versions/
(base) dendezia@scorpion:~/.pyenv/versions$ ls
anaconda3-2020.11
(base) dendezia@scorpion:~/.pyenv/versions$ ls anaconda3-2020.11/
LICENSE.txt  compiler_compat  condabin  envs  include  lib      man      phrasebooks  plugins  resources  share  ssl           var
bin          conda-meta       doc       etc   info     libexec  mkspecs  pkgs         qml      sbin       shell  translations  x86_64-conda_cos6-linux-gnu
(base) dendezia@scorpion:~/.pyenv/versions$

BRAKER3用のAnaconda環境であるbrakerを作成

base) dendezia@scorpion:~/tool/pyenv_env$ conda create -n braker python=3.9
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 4.9.2
  latest version: 24.9.1

Please update conda by running
.
.
.

braker環境用のプロファイル、braker_profile~/tool/pyenv_envの下に作成。

### braker_profileの中身

source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global anaconda3-2020.11



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/dendezia/.pyenv/versions/anaconda3-2020.11/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/dendezia/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh" ]; then
        . "/home/dendezia/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh"
    else
        export PATH="/home/dendezia/.pyenv/versions/anaconda3-2020.11/bin:$PATH"
    fi
fi
unset __conda_setup
# <<< conda initialize <<<

conda activate braker

また、scorpion内に.bash_profileがなかったのでそれも作成。

### .bash_profileの中身

# .bash_profile

# Get the aliases and functions
if [ -f ~/.bashrc ]; then
    . ~/.bashrc
fi

# User specific environment and startup programs

PATH=$PATH:$HOME/.local/bin:$HOME/bin

export PATH

ここまでやってbraker_profilesourceするとbraker環境に入れる。

ただ。現状ではまだBRAKERは入っておらず、空の環境があるだけ。

1007

IQ-TREEを使ったCAFE5の結果の処理

ローカルの~/bioCAFE5resultのディレクトリをコピーした。

:~/bio/for_cafe/241007_cafe_original_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useIQTREE/results ~/bio/for_cafe/241007_cafe_original_data/useIQTREE
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Base_clade_results.txt                                                                                                                               100%  163     5.0KB/s   00:00    
Base_asr.tre                                                                                                                                         100% 1422KB  10.7MB/s   00:00    
Base_count.tab                                                                                                                                       100%  245KB   2.6MB/s   00:00    
Base_results.txt                                                                                                                                     100%  163     5.1KB/s   00:00    
Base_family_likelihoods.txt                                                                                                                          100%  154KB   1.2MB/s   00:00    
Base_family_results.txt                                                                                                                              100%  146KB   2.3MB/s   00:00    
Base_branch_probabilities.tab                                                                                                                        100%   72KB   1.9MB/s   00:00    
Base_change.tab                                                                                                                                      100%  327KB   5.1MB/s   00:00    
:~/bio/for_cafe/241007_cafe_original_data$ ls
useIQTREE

同じくOrthofinderの出力もコピーした。

:~/bio/for_cafe/0930_orthofinder_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups.tsv ~/bio/for_cafe/0930_orthofinder_data/
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.tsv                                                                                                                                      100% 2350KB  15.7MB/s   00:00    
:~/bio/for_cafe/0930_orthofinder_data$ ls
ASTRAL_6sp.txt                     IQTREE_6sp_out.txt                 Orthogroups.GeneCount2.tsv         tree_ASTRAL_ultrametric.nwk
ASTRAL_6sp_after_root_outgroup.txt Orthogroups.GeneCount.tsv          Orthogroups.tsv                    tree_IQTREE_ultrametric.nwk
:~/bio/for_cafe/0930_orthofinder_data$ 

これを元に以下のスクリプトを実行した。

library(tidyverse)
Deg<-read.csv("Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
Plami<-read.csv("241007_cafe_original_data/useIQTREE/Base_change.tab", sep="\t")

View(Plami)

# ファイルを読み込む
file_path <- "241007_cafe_original_data/useIQTREE/Base_asr.tre"
lines <- readLines(file_path)# linesはCAFEが推定した各遺伝子ファミリーの系統樹。有意な増減があったところには*がふられている。
print(lines)

# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", lines))
trees_end <- which(grepl("END;", lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
print(trees_lines)

# 不要なスペースを削除
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)

# データフレームに変換
library(tibble)
trees_df <- tibble(Tree = trees_lines)

ex=trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
  #lines|> 
  tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
  dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
  dplyr::mutate(tree = stringr::str_extract(tree, "Smad<4>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)のみを抽出
  dplyr::mutate(tree = stringr::str_replace(tree, "Smad<4>\\*_", "significant")) |>#わかりやすいようにsignificantに変更
  dplyr::filter(tree == "significant") |>#significantのみを抽出
  print()

View(ex)
#################################################################

Plami2=Plami |>###マダラで増加した0Gの0G番号を抽出したファイル
  dplyr::select("FamilyID","Smad.4.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
  dplyr::mutate(Smad.4. = stringr::str_extract(Smad.4., r"(^\d+)")) |>#マダラの遺伝子量増減の列のうち、数字のみのもの(-がついておらず、遺伝子数が増加しているもの)を抽出
  tidyr::drop_na()|>
  dplyr::filter(Smad.4. != 0) |>#遺伝子数の増加分が0のものを除去
  print()
View(Plami2)

#################################################################

df=dplyr::inner_join(Plami2, ex, by = c(FamilyID = "OG_num"))|>###マダラで優位に増加したOGのOG番号を抽出したファイル
  print()

##################################################################
# ファイルパスの指定
orthogroups_file <- "0930_orthofinder_data/Orthogroups.tsv"

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

# データの最初の数行を表示して確認
head(orthogroups)
View(orthogroups)

df2=dplyr::left_join(df, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
  dplyr::select(!c(Smad.4., tree)) |>
  print()
View(df2)

################################################################

# V5列の遺伝子IDをカンマで区切って、新しいデータフレームを作成

df_expanded <- df2 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
  separate_rows(V5, sep = ", ") %>%
  rename(gene_ID = V5, family_ID = FamilyID)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) |>
  print()
###############################################################
### CAFE5でマダラでのみ増加した遺伝子とその機能のファイル、df3

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
View(fa)

df3=dplyr::left_join(df_expanded, fa, by = c(gene_ID = "Madara"))|>###完成系
  print()

View(df3)

結果として、104個の遺伝子ファミリー、584個の遺伝子が、有意に増加した遺伝子として検出された。

ASTRALを使ったCAFE5の結果の処理

ローカルの~/bioに同じくCAFE5resultディレクトリをコピーした。


:~/bio/for_cafe/0930_orthofinder_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useASTRAL/results ~/bio/for_cafe/241007_cafe_original_data/useASTRAL
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Base_clade_results.txt                                                                                                                               100%  168     5.0KB/s   00:00    
Base_asr.tre                                                                                                                                         100% 1702KB  10.7MB/s   00:00    
Base_count.tab                                                                                                                                       100%  281KB   3.8MB/s   00:00    
Base_results.txt                                                                                                                                     100%  162     4.6KB/s   00:00    
Base_family_likelihoods.txt                                                                                                                          100%  165KB   2.5MB/s   00:00    
Base_family_results.txt                                                                                                                              100%  157KB   2.3MB/s   00:00    
Base_branch_probabilities.tab                                                                                                                        100%   73KB 937.3KB/s   00:00    
Base_change.tab                                                                                                                                      100%  377KB   4.8MB/s   00:00    
:~/bio/for_cafe/0930_orthofinder_data$ 

これを元に以下のスクリプトを実行した。

################################################################################
### ASTRALを使用

A_df1=read.csv("241007_cafe_original_data/useASTRAL/Base_change.tab", sep="\t")
print(A_df1)

A_lines=readLines("241007_cafe_original_data/useASTRAL/Base_asr.tre")

# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", A_lines))
trees_end <- which(grepl("END;", A_lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)
print(trees_lines)

# データフレームに変換
trees_df <- tibble(Tree = trees_lines)

ex=trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
  #lines|> 
  tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
  dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
  dplyr::mutate(tree = stringr::str_extract(tree, "Smad<4>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)のみを抽出
  dplyr::mutate(tree = stringr::str_replace(tree, "Smad<4>\\*_", "significant")) |>#わかりやすいようにsignificantに変更
  dplyr::filter(tree == "significant") |>#significantのみを抽出
  print()

View(ex)

#################################################################

A_df2=A_df1 |>###マダラで増加した0Gの0G番号を抽出したファイル
  dplyr::select("FamilyID","Smad.1.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
  dplyr::mutate(Smad.1. = stringr::str_extract(Smad.1., r"(^\d+)")) |>#マダラの遺伝子量増減の列のうち、数字のみのもの(-がついておらず、遺伝子数が増加しているもの)を抽出
  tidyr::drop_na()|>
  dplyr::filter(Smad.1. != 0) |>#遺伝子数の増加分が0のものを除去
  print()
View(Plami2)

#################################################################

A_df3=dplyr::inner_join(A_df2, ex, by = c(FamilyID = "OG_num"))|>###マダラで優位に増加したOGのOG番号を抽出したファイル
  print()

##################################################################
# ファイルパスの指定
orthogroups_file <- "0930_orthofinder_data/Orthogroups.tsv"

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

# データの最初の数行を表示して確認
head(orthogroups)
View(orthogroups)

A_df4=dplyr::left_join(A_df3, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
  dplyr::select(!c(Smad.1., tree)) |>
  print()
View(A_df4)

################################################################

# V5列の遺伝子IDをカンマで区切って、新しいデータフレームを作成

A_df5 <- A_df4 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
  separate_rows(V5, sep = ", ") %>%
  rename(gene_ID = V5, family_ID = FamilyID)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) |>
  print()

###############################################################
### CAFE5でマダラでのみ増加した遺伝子とその機能のファイル、df3

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
View(fa)

A_df6=dplyr::left_join(A_df5, fa, by = c(gene_ID = "Madara"))|>###完成系
  print()

View(A_df6)

################################################################

まとめ

IQTREE系統樹を使ったCAFE5にて、マダラで有意に増加した遺伝子ファミリーに含まれる遺伝子

# A tibble: 584 × 10
   family_ID gene_ID   Ecoli Ecol_GeneFunction Dmelanogaster Dmel_GeneFunction
   <chr>     <chr>     <chr> <chr>             <chr>         <chr>            
 1 OG0000006 g10906.t1 ""    ""                ""            ""               
 2 OG0000006 g11758.t1 ""    ""                ""            ""               
 3 OG0000006 g12212.t1 ""    ""                ""            ""               
 4 OG0000006 g12251.t1 ""    ""                ""            ""               
 5 OG0000006 g12781.t1 ""    ""                ""            ""               
 6 OG0000006 g13547.t1 ""    ""                ""            ""               
 7 OG0000006 g1616.t1  ""    ""                ""            ""               
 8 OG0000006 g2495.t1  ""    ""                ""            ""               
 9 OG0000006 g3400.t1  ""    ""                ""            ""               
10 OG0000006 g3400.t2  ""    ""                ""            ""               
# ℹ 574 more rows
# ℹ 4 more variables: Tcastaneum <chr>, Tcas_GeneFunction <chr>, Soryzae <chr>,
#   Sory_GeneFunction <chr>

ASTRAL系統樹を使ったCAFE5にて、マダラで有意に増加した遺伝子ファミリーに含まれる遺伝子

# A tibble: 584 × 10
   family_ID gene_ID   Ecoli Ecol_GeneFunction Dmelanogaster Dmel_GeneFunction
   <chr>     <chr>     <chr> <chr>             <chr>         <chr>            
 1 OG0000006 g10906.t1 ""    ""                ""            ""               
 2 OG0000006 g11758.t1 ""    ""                ""            ""               
 3 OG0000006 g12212.t1 ""    ""                ""            ""               
 4 OG0000006 g12251.t1 ""    ""                ""            ""               
 5 OG0000006 g12781.t1 ""    ""                ""            ""               
 6 OG0000006 g13547.t1 ""    ""                ""            ""               
 7 OG0000006 g1616.t1  ""    ""                ""            ""               
 8 OG0000006 g2495.t1  ""    ""                ""            ""               
 9 OG0000006 g3400.t1  ""    ""                ""            ""               
10 OG0000006 g3400.t2  ""    ""                ""            ""               
# ℹ 574 more rows
# ℹ 4 more variables: Tcastaneum <chr>, Tcas_GeneFunction <chr>, Soryzae <chr>,
#   Sory_GeneFunction <chr>

CAFE5全体での結果は以下の通り

### useIQTREE/Base_clade_results.txt 

#Taxon_ID       Increase        Decrease
Agra<8> 2793    925
<7>     1       53
<5>     1       12
Cass<1> 499     3639
Dpon<0> 2982    1374
Smad<4> 886     3180
<6>     861     573
Sory<3> 2191    1232
Tcas<2> 1853    1984
### useASTRAL/Base_clade_results.txt

#Taxon_ID       Increase        Decrease
Cass<0> 592     3547
Agra<9> 3187    914
<8>     27      1188
<7>     710     754
Sory<5> 2319    1113
Tcas<4> 1923    1958
<6>     1       194
Dpon<3> 3181    1428
Smad<1> 1013    3123

牧野研wiki用のパスワード生成

:~$ echo -n '(ここにパスワードを入れる)' | shasum -a 256

scorpion内にmambaforge/EDTA環境を立ち上げる

dendezia@scorpion:~/pyenv_conda_environment$ pyenv install mambaforge-22.9.0-3
Downloading Mambaforge-22.9.0-3-Linux-x86_64.sh.sh...
-> https://github.com/conda-forge/miniforge/releases/download/22.9.0-3/Mambaforge-22.9.0-3-Linux-x86_64.sh
Installing Mambaforge-22.9.0-3-Linux-x86_64.sh...
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 22.9.0
  latest version: 24.9.1

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /home/dendezia/.pyenv/versions/mambaforge-22.9.0-3

  added / updated specs:
    - conda=22.9.0
    - pip


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.8.30  |       hbcca054_0         155 KB  conda-forge
    certifi-2024.8.30          |     pyhd8ed1ab_0         160 KB  conda-forge
    pip-24.2                   |     pyh8b19718_1         1.2 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         1.5 MB

The following packages will be UPDATED:

  ca-certificates                      2022.12.7-ha878542_0 --> 2024.8.30-hbcca054_0 None
  certifi                            2022.12.7-pyhd8ed1ab_0 --> 2024.8.30-pyhd8ed1ab_0 None
  pip                                   22.3.1-pyhd8ed1ab_0 --> 24.2-pyh8b19718_1 None



Downloading and Extracting Packages
ca-certificates-2024 | 155 KB    | ############################################################################################################################################ | 100% 
pip-24.2             | 1.2 MB    | ############################################################################################################################################ | 100% 
certifi-2024.8.30    | 160 KB    | ############################################################################################################################################ | 100% 
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Retrieving notices: ...working... done
Installed Mambaforge-22.9.0-3-Linux-x86_64.sh to /home/dendezia/.pyenv/versions/mambaforge-22.9.0-3
dendezia@scorpion:~/pyenv_conda_environment$ 

これでmambaforgeのインストールは完了。

次に~/tool/pyenv_envEDTA_profileを作成する。EDTA_profileの中身は以下の通り。

### EDTA_profileの中身

source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
        . "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
    else
        export PATH="/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
    fi
fi
unset __conda_setup
if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
    . "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<

conda activate EDTA2

これをそのままsourceするとmambaforgeの環境には入れるが、EDTAの環境が無いためbaseに入る。

その状態で下のコードを実行する。

(base) dendezia@scorpion:~/tool/pyenv_env$ git clone https://github.com/oushujun/EDTA.git
Cloning into 'EDTA'...
remote: Enumerating objects: 4879, done.
remote: Counting objects: 100% (626/626), done.
remote: Compressing objects: 100% (168/168), done.
remote: Total 4879 (delta 479), reused 580 (delta 455), pack-reused 4253 (from 1)
Receiving objects: 100% (4879/4879), 232.57 MiB | 17.86 MiB/s, done.
Resolving deltas: 100% (2769/2769), done.
Updating files: 100% (222/222), done.
(base) dendezia@scorpion:~/tool/pyenv_env$ ls
EDTA  EDTA_profile  braker_profile

EDTAのgitにはmambaのイメージが置いてあるので、それを利用する。

EDTAのディレクトリに入り、以下のコマンドを実行する。

mamba env create -f EDTA_2.2.x.yml

これによりEDTA2というmambaの環境が立ち上がる。あとはEDTA_profilesourceすればEDTAの環境に入れる。

(base) dendezia@scorpion:~/tool/pyenv_env$ source EDTA_profile 
(EDTA2) dendezia@scorpion:~/tool/pyenv_env$ 

scorpionでのEcanのソフトマスク

scorpionにECanのゲノムデータを転送。

:~/Downloads$ scp  Ekam_ncbi_dataset.zip dendezia@scorpion:/home/dendezia/tool/for_softmask/nama_data
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
|       .+. .=o=+.|
|        o*.o.=.*+|
|       oo.*oo B.o|
|      ..o= +.* ..|
|    o .+S o * .  |
|   . o. .  E     |
|      ....o      |
|       oo+       |
|       o=        |
+----[SHA256]-----+
Ekam_ncbi_dataset.zip                                                                                                                                100%  274MB  98.6MB/s   00:02    
:~/Downloads$ 
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_ncbi_dataset.zip
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ unzip Ekam_ncbi_dataset.zip 
Archive:  Ekam_ncbi_dataset.zip
  inflating: README.md               
  inflating: ncbi_dataset/data/data_summary.tsv  
  inflating: ncbi_dataset/data/assembly_data_report.jsonl  
  inflating: ncbi_dataset/data/GCA_014849505.1/genomic.gbff  
  inflating: ncbi_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna  
  inflating: ncbi_dataset/data/GCA_014849505.1/sequence_report.jsonl  
  inflating: ncbi_dataset/data/dataset_catalog.json  
  inflating: md5sum.txt              
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_ncbi_dataset.zip  README.md  md5sum.txt  ncbi_dataset
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls ncbi_dataset/
data
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls data
ls: 'data' にアクセスできません: そのようなファイルやディレクトリはありません
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls ncbi_dataset/data/
GCA_014849505.1  assembly_data_report.jsonl  data_summary.tsv  dataset_catalog.json
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls ncbi_dataset/data/GCA_014849505.1/
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna  genomic.gbff  sequence_report.jsonl
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv ncbi_dataset/ Ekam_dataset
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_dataset  Ekam_ncbi_dataset.zip  README.md  md5sum.txt
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv Ekam_ncbi_dataset.zip Ekam_dataset/
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_dataset  README.md  md5sum.txt
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv README.md Ekam_dataset/
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv md5sum.txt Ekam_dataset/
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_dataset
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ 

まずEkamのデータベースを作成する。

(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ ls ../nama_data/Ekam_dataset/data/GCA_014849505.1/
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna  genomic.gbff  sequence_report.jsonl
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ BuildDatabase -name Ekam_BLAST_DATABASE ../nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
Building database Ekam_BLAST_DATABASE:
  Reading ../nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna...
Number of sequences (bp) added to database: 364527 ( 269635327 bp )
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ ls
Ekam_BLAST_DATABASE.nhr  Ekam_BLAST_DATABASE.njs  Ekam_BLAST_DATABASE.nni  Ekam_BLAST_DATABASE.nsq
Ekam_BLAST_DATABASE.nin  Ekam_BLAST_DATABASE.nnd  Ekam_BLAST_DATABASE.nog  Ekam_BLAST_DATABASE.translation
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ 

EkamRepeatModeler

~/tool/for_softmask/Ekam_softmaskで以下のスクリプトを書き、qsubで実行した。

### Ekam_RepeatModeler.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

source /home/dendezia/tool/pyenv_env/EDTA_profile

RepeatModeler -database /home/dendezia/tool/for_softmask/Ekam_softmask/Ekam_BLAST_DATABASE -pa 6
date

絶対パスじゃないとエラーが出るので注意!

IQTREEとASTRALの出力ファイルの先頭にツール名を付ける



import pandas as pd

# ファイルの読み込み
file_path = "Base_change.tab"
df = pd.read_csv(file_path, sep="\t")

# FamilyID列に「ASTRAL_」を追加
df['FamilyID'] = "ASTRAL_" + df['FamilyID'].astype(str)

# 結果のファイルを保存
output_file = "Base_change_with_astral.tab"
df.to_csv(output_file, sep="\t", index=False)

print(f"変換されたファイルが {output_file} に保存されました。")

1008

:~/bio/for_cafe$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe ~/bio/for_cafe/241008_original_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
addtreetool.py                                                                                                                                          100%  423    22.8KB/s   00:00    
Base_clade_results.txt                                                                                                                                  100%  168     6.8KB/s   00:00    
Base_asr.tre                                                                                                                                            100% 1702KB  10.2MB/s   00:00    
Base_count.tab                                                                                                                                          100%  281KB   3.4MB/s   00:00    
Base_change_with_astral.tab                                                                                                                             100%  356KB   7.0MB/s   00:00    
Base_results.txt                                                                                                                                        100%  162     5.4KB/s   00:00    
Base_family_likelihoods.txt                                                                                                                             100%  165KB   2.1MB/s   00:00    
Base_family_results.txt                                                                                                                                 100%  157KB   3.1MB/s   00:00    
Base_branch_probabilities.tab                                                                                                                           100%   73KB   1.4MB/s   00:00    
Base_change.tab                                                                                                                                         100%  377KB   7.6MB/s   00:00    
tree_ASTRAL_ultrametric.nwk                                                                                                                             100%  162     5.9KB/s   00:00    
Orthogroups.GeneCount2.tsv                                                                                                                              100%  400KB   3.6MB/s   00:00    
Base_clade_results.txt                                                                                                                                  100%  163     5.8KB/s   00:00    
Base_asr.tre                                                                                                                                            100% 1422KB   8.0MB/s   00:00    
Base_count.tab                                                                                                                                          100%  245KB   3.2MB/s   00:00    
Base_results.txt                                                                                                                                        100%  163     5.5KB/s   00:00    
Base_family_likelihoods.txt                                                                                                                             100%  154KB   2.1MB/s   00:00    
Base_family_results.txt                                                                                                                                 100%  146KB   1.9MB/s   00:00    
Base_branch_probabilities.tab                                                                                                                           100%   73KB   2.1MB/s   00:00    
Base_change.tab                                                                                                                                         100%  327KB   4.0MB/s   00:00    
Base_clade_results.txt                                                                                                                                  100%  247     8.5KB/s   00:00    
Base_asr.tre                                                                                                                                            100% 1485KB   9.3MB/s   00:00    
Base_count.tab                                                                                                                                          100%  252KB   3.2MB/s   00:00    
Base_results.txt                                                                                                                                        100%  161     5.2KB/s   00:00    
yuui.py                                                                                                                                                 100% 1861    59.5KB/s   00:00    
subete_yuui.py                                                                                                                                          100% 2358    37.1KB/s   00:00    
Base_family_likelihoods.txt                                                                                                                             100%  149KB   2.9MB/s   00:00    
Base_family_results.txt                                                                                                                                 100%  140KB   1.9MB/s   00:00    
Base_branch_probabilities.tab                                                                                                                           100%   60KB 971.9KB/s   00:00    
Base_change.tab                                                                                                                                         100%  338KB   3.0MB/s   00:00    
Tcas_yuui.txt                                                                                                                                           100% 1395    47.0KB/s   00:00    
old_tree_ultrametric.nwk                                                                                                                                100%  178     5.8KB/s   00:00    
tree_ultrametric.nwk                                                                                                                                    100%  143     4.5KB/s   00:00    
Orthogroups.GeneCount2.tsv                                                                                                                              100%  400KB   4.9MB/s   00:00    
addtreetool.py                                                                                                                                          100%  423    22.9KB/s   00:00    
Base_clade_results.txt                                                                                                                                  100%  163     5.0KB/s   00:00    
Base_asr.tre                                                                                                                                            100% 1422KB   8.8MB/s   00:00    
Base_count.tab                                                                                                                                          100%  245KB   3.0MB/s   00:00    
Base_results.txt                                                                                                                                        100%  163     6.3KB/s   00:00    
Base_family_likelihoods.txt                                                                                                                             100%  154KB   2.2MB/s   00:00    
Base_family_results.txt                                                                                                                                 100%  146KB   2.6MB/s   00:00    
Base_change_with_IQTREE.tab                                                                                                                             100%  315KB   6.3MB/s   00:00    
Base_branch_probabilities.tab                                                                                                                           100%   72KB   2.0MB/s   00:00    
Base_change.tab                                                                                                                                         100%  327KB   6.5MB/s   00:00    
tree_IQTREE_ultrametric.nwk                                                                                                                             100%  143     2.6KB/s   00:00    
:~/bio/for_cafe$ ls
0930_orthofinder_data              Deg                                Rplot01.png                        caferesult_6sp_iqtree.png          out_madara_SP.txt
241007_cafe_original_data          ManualPhylo_1.py                   Rplot02.png                        cafe後処理.R                       tree_IQTREE_ultrametric.nwk
241008_original_data               ManualPhylo_2.py                   ThroughoutCAFE.R                   cleaned_orthogroups.tsv            tree_ultrametric.nwk
ASTRAL_6sp_after_root_outgroup.txt ManualPhylo_3.py                   branch_site_lrt_results.txt        for_cafe.Rproj
CAFE_plus_gene.csv                 Original_data                      bs_positive_gene.csv               for_sinkagakkai.png
DEG_CAFE_adult_vs_larva.csv        Processed_data                     caferesult.R                       ogfil.py
DEG_CAFE_ovary_vs_body.csv         Rplot.png                          caferesult_6sp.png                 old_result
:~/bio/for_cafe$ ls 241008_original_data/
6sp_useASTRAL                6sp_useIQTREE                madara_4weevil_Tcas_cafetest
:~/bio/for_cafe$ 

ASTRALを使ったPAMLの続き、尤度比検定

kosukesano@at138:~/tools/for_paml/ASTRAL_6sp/bsA$ grep "lnL" result/OG00*_maffted_fixed_branch_alt 
result/OG0008033_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -6780.075815      +0.000000
result/OG0008036_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -3759.435049      +0.000000
result/OG0008044_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -3513.322057      +0.000000
result/OG0008046_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -2111.234962      +0.000000
result/OG0008048_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -6391.383350      +0.000000
result/OG0008055_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -7082.109637      +0.000000
result/OG0008058_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -6357.858837      +0.000000
result/OG0008060_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -8190.191741      +0.000000
result/OG0008065_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -3881.207146      +0.000000
result/OG0008070_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -8519.310868      +0.000000
result/OG0008071_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15): -11407.320280      +0.000000
result/OG0008075_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -7685.699268      +0.000000
result/OG0008095_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -5111.973555      +0.000000
result/OG0008097_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -2546.272979      +0.000000
result/OG0008099_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -7363.406172      +0.000000
result/OG0008101_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15): -46936.546613      +0.000000
result/OG0008106_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15):  -8907.560611      +0.000000
result/OG0008110_maffted_fixed_branch_alt:lnL(ntime: 10  np: 15): -11486.635559      +0.000000
.
.
.
.
.
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp/bs_null$ grep "lnL" result/OG00*_maffted_fixed_branch_alt_null
result/OG0008033_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -6780.105180      +0.000000
result/OG0008036_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -3759.435049      +0.000000
result/OG0008044_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -3513.334051      +0.000000
result/OG0008046_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -2111.234961      +0.000000
result/OG0008048_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -6392.377523      +0.000000
result/OG0008055_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -7082.372261      +0.000000
result/OG0008058_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -6357.858837      +0.000000
result/OG0008060_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -8190.325473      +0.000000
result/OG0008065_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -3881.207146      +0.000000
result/OG0008070_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -8519.419180      +0.000000
result/OG0008071_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14): -11407.055286      +0.000000
result/OG0008075_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -7685.699268      +0.000000
result/OG0008095_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -5111.973555      +0.000000
result/OG0008097_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -2546.802003      +0.000000
result/OG0008099_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14):  -7363.406172      +0.000000
result/OG0008101_maffted_fixed_branch_alt_null:lnL(ntime: 10  np: 14): -46936.546616      +0.000000
.
.
.
.
.

取れてないOGもあるっぽいけど、基本はちゃんとできてそう。

尤度比検定用のPythonスクリプトbs_lrp.pyを作成、実行した。

kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ nano bs_lrp.py
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ python bs_lrp.py 
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls
branch_site_lrt_results.txt  bsA  bs_lrp.py  bs_null  data  rst  rst1  rub  run_paml.sh.e26903588  run_paml.sh.o26903588
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ less branch_site_lrt_results.txt 
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ mv branch_site_lrt_results.txt ASTRAL_branch_site_lrt_results.txt 
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls
ASTRAL_branch_site_lrt_results.txt  bsA  bs_lrp.py  bs_null  data  rst  rst1  rub  run_paml.sh.e26903588  run_paml.sh.o26903588
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ 

bs_lrp.pyの中身は以下の通り。

### bs_lrp.pyの中身

###~/tools/for_paml/6sp/bs_lrp.pyの中身

import os
import re
from scipy.stats import chi2

def parse_lnL(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            match = re.search(r'lnL\(ntime: \d+  np: (\d+)\):\s+(-?\d+\.\d+)', line)
            if match:
                np = int(match.group(1))
                lnL = float(match.group(2))
                return np, lnL
    return None, None

def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
    lr_stat = 2 * (alt_lnL - null_lnL)
    df = alt_np - null_np
    p_val = chi2.sf(lr_stat, df)
    return p_val

def main():
    alt_dir = '~/tools/for_paml/ASTRAL_6sp/bsA/result'
    null_dir = '~/tools/for_paml/ASTRAL_6sp/bs_null/result'
    output_file = 'branch_site_lrt_results.txt'

    alt_dir = os.path.expanduser(alt_dir)
    null_dir = os.path.expanduser(null_dir)

    og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]

    with open(output_file, 'w') as out_f:
        out_f.write('OG_num\tp_val\tpositive_selection\n')

        for og_file in og_files:
            og_num = og_file.split('_')[0]
            alt_file = os.path.join(alt_dir, og_file)
            null_file = os.path.join(null_dir, og_file.replace('_maffted_fixed_branch_alt', '_maffted_fixed_branch_alt_null'))

            if os.path.exists(null_file):
                alt_np, alt_lnL = parse_lnL(alt_file)
                null_np, null_lnL = parse_lnL(null_file)

                if alt_np is not None and null_np is not None:
                    p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
                    reject_null = '+' if p_val < 0.05 else '-'
                    out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')

if __name__ == "__main__":
    main()

これをローカルに送った。ローカルでは~/bio/for_paml/241008を作りそこに格納。

:~/bio/for_paml$ mkdir 241008
:~/bio/for_paml$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano//tools/for_paml/ASTRAL_6sp/ASTRAL_branch_site_lrt_results.txt 241008
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
ASTRAL_branch_site_lrt_results.txt                                                                                                                            100% 7551   218.5KB/s   00:00    
:~/bio/for_paml$ ls 241008/
ASTRAL_branch_site_lrt_results.txt

また、FDRの検定を行った。


:~/bio/for_paml$ source paml_hosei/bin/activate
(paml_hosei) :~/bio/for_paml$ xd 241008/
bash: xd: command not found
(paml_hosei) :~/bio/for_paml$ ls
241008                            branch_site_lrt_results.txt       hosei.py                          hosei_branch_site_lrt_results.txt paml_hosei
(paml_hosei) :~/bio/for_paml$ cd 241008/
(paml_hosei) :~/bio/for_paml/241008$ ls
ASTRAL_branch_site_lrt_results.txt hosei.py
(paml_hosei) :~/bio/for_paml/241008$ python hosei.py 
補正後の結果がhosei_ASTRAL_branch_site_lrt_results.txtに保存されました。

検定後のデータは以下の通り。

FDR=read.csv("/Users/kosukesano/bio/for_paml/241008/hosei_ASTRAL_branch_site_lrt_results.txt", sep="\t")|>
  dplyr::filter(significant == "True") 

orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

FDR_2=dplyr::left_join(FDR, orthogroups, by = c(OG_num = "V1"))|>
  rename(gene_ID = V5)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) 

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")


FDR_3=dplyr::left_join(FDR_2, fa, by = c(gene_ID = "Madara"))|>
  print()
     OG_num        p_val positive_selection        q_val significant   gene_ID
1 OG0009606 1.504944e-04                  + 1.635372e-02        True g12267.t1
2 OG0008249 5.519799e-04                  + 4.498636e-02        True  g9681.t1
3 OG0008782 3.281172e-18                  + 1.069662e-15        True  g9945.t1
4 OG0008142 3.323308e-05                  + 5.416991e-03        True  g4236.t1
  Ecoli Ecol_GeneFunction       Dmelanogaster
1                         Dmel_NP_001163062.1
2                            Dmel_NP_476803.1
3                            Dmel_NP_476617.1
4                            Dmel_NP_610687.1
                        Dmel_GeneFunction          Tcastaneum
1 heparan sulfate C5-epimerase, isoform B Tcas_XP_015839037.1
2                                   scute Tcas_NP_001034533.1
3                               laminin A Tcas_XP_008190900.1
4                                  enigma Tcas_XP_008190394.1
                                                 Tcas_GeneFunction
1                             PREDICTED: D-glucuronyl C5-epimerase
2                                                           asense
3                                 PREDICTED: laminin subunit alpha
4 PREDICTED: acyl-CoA dehydrogenase family member 9, mitochondrial
              Soryzae                                     Sory_GeneFunction
1 Sory_XP_030766642.1                           D-glucuronyl C5-epimerase B
2 Sory_XP_030750625.1                  uncharacterized protein LOC115878311
3 Sory_XP_030765960.1                                 laminin subunit alpha
4 Sory_XP_030752597.1 acyl-CoA dehydrogenase family member 9, mitochondrial

IQ-TREEを使ったPAMLの続き、尤度比検定

bs_nullの方の出力ファイルが_maffted_fixed_branch_altだけだったので、末尾に_nullを追加するスクリプトplus_null.pyを作成、実行した。

### plus_null.pyの中身

import os

# ファイルが格納されているディレクトリのパス
directory = '~/tools/for_paml/IQTREE_6sp/bs_null/result'

# 実際のパスに変換
directory = os.path.expanduser(directory)

# ディレクトリ内のファイルを一括で変更
for filename in os.listdir(directory):
    if '_maffted_fixed_branch_alt' in filename:
        # 新しいファイル名を作成
        new_filename = filename.replace('_maffted_fixed_branch_alt', '_maffted_fixed_branch_alt_null')
        
        # ファイル名の変更
        os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename))

print("ファイル名の変更が完了しました。")

これをやったのち、bs_lrp.pyを作成して実行。

なぜか出力ファイルに書き込まれない

データがSCOではない?

ちゃんとSCOが取れていない。

### 6sp/data/SCO_plusname/OG0010059_maffted_fixed.fastaの中身
>Cass   
atggagaacttagcaaagccccaaataatttgccacaatcaaaaatccttagattacgct
attcacgacgtcaaatggattccttgctccgcaaaatttgtagctataggaggcaaatct
aacggtgcaggtattgtggaaacttatcagctatctgcagatggcatagaaaaactagac
gaattttgcaaaaaggatcacttcaaatgttgcacttttgaagcgtcgagtttgaggaac
aggcatttggcgactggagatttttcgggacgattacaactctgggacctagaagacact
ctgacaccagtttacaaaaccacagtgcacactgctgtaatcaattcaatagatggagtg
gcaggccaaagcgctaactgtggagctccagaaattgttactggttcccgcgacggttgt
gtaatggtatgggatgtgcgccaaaaagacattccagtggcaaaattcactcctttagaa
ggccaagcaggcagagactgttggtgcgtggcttttggaaattcctacaacgacactgaa
aggatagtagctgcaggatatgataatggagacgttaaattgtttgacttgaaaactatg
agcgtacgatggacgaaatgccttaaaaatgggattgtcgatttgcaatttgatcgcaaa
gatataccgatgaacaaactggtggccaccacgttggaatctaaatttttctgtttcgat
gtacgcactcaacatccaaaaaaaggctttgcgcatttaatagaaaatgcgcatgcatct
acaatttggcaagtaaagcatctgccgcaaaatcgagaaatctttatgactaccggaggc
ggtggatctttgtgtttatggaaatatacatatccaccaaaaagagtagagaaagactct
gaaggtatccaatatggaataatgggtgaattacaccaaatacaaaacagtggactttct
gatcaaccgataacggcttttgattggtgcgtggataaattgggccttgcagtgtgttca
gcttacgatcagactttaagagttctgataacgaccaaactgaatttatgctaa
>Smad   
atggaatctttagcgaaaccccaaataatttgtcacaatcagaaatcattagattatgcg
attcacgatgtgaaatggataccttgttctgcgaaatttatttctgtagggggaaaatca
aacggagcgggcatagtagaaatttattcgatatccggggaaggagtggaaaaactggac
gaattttgcaaaaaggatcattttaaatgctgcacattcgatgcttctagcttaaggaat
cggcatttagctactggggacttttcaggacgattgcaactttgggatttggaagacact
ataatgcctgtttataaaactacgactcacactgctgttattaactcaatagacggggta
gcggggcaaagcgccaactgtggagcgcctgaaatagtgacaggttctcgtgatggttgt
gtgatggtttgggacgtgagacagaaggacattccggtagcgaaattcacccccctcgaa
gggcaaagtggacgagattgttggtgcgtagcctttggaaattcttacaacaacgaagag
agggtagtagctgcaggatacgataacggggatgttaaaattttcgatctaaaaaccatg
agcgttcgatggacaaagtgtctaaaaaacggggtggtaaatcttcaattcgaccgaaaa
gacattcccatgaacaaactagtggtgaccaccctggaatcgaaatttttctgcttcgac
gtccgcactcaacatcccaaaaaaggattcgcccacctttccgaaaccgcacacgcctct
acgatatggcaagtgaaacacttgcctcagaacagagaaattttcatgacgaccggtggt
agtgggtctttgtgtttatggaagtacaattacccaatcaaaagggttgaaaaagattct
gaaggaattccatatggaatcataggtgacgtacaacaactccaaaacagtgccctgtct
gaacaacccatcactgcttttgactggtgtgttgacaaactaggtctagctgtgtgctca
gcatatgaccaaaccttgagagttttaataactactaaattgaacttatattag

改めてOGのCDSを取るため、~/tools/for_paml/data/241008_SCOディレクトリを作りExOG.pyを書いた。

### ExOG.pyの中身

# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups_SingleCopyOrthologues.txt'
output_file_path = '/home/kosukesano/tools/for_paml/data/241008_SCO/extracted_orthogroups.txt'

# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
    for line in single_copy_file:
        single_copy_orthologues.add(line.strip())

# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
    for line in orthogroups_file:
        # 行の最初の部分を取り出してIDをチェック
        og_id = line.split(':')[0].strip()
        if og_id in single_copy_orthologues:
            output_file.write(line)

これを実行すると、extracted_orthogroups.txtができる。

OG0008033: Agra_P_050292688.1 Cass_AG9767834.1 Dpon_P_019769583.1 Smad_g5339.t1 Sory_P_030760502.1 Tcas_P_001812254.1
OG0008034: Agra_P_050292700.1 Cass_AG9761214.1 Dpon_P_019755574.2 Smad_g6358.t1 Sory_P_030761209.1 Tcas_P_008195282.1
OG0008035: Agra_P_050292731.1 Cass_AH1135743.1 Dpon_P_048519923.1 Smad_g2098.t1 Sory_P_030765758.1 Tcas_P_008196870.1
OG0008036: Agra_P_050292732.1 Cass_AG9767756.1 Dpon_P_019773495.1 Smad_g5269.t1 Sory_P_030765067.1 Tcas_P_015836383.1
OG0008037: Agra_P_050292739.1 Cass_AG9768060.1 Dpon_P_019769194.2 Smad_g11904.t1 Sory_P_030755089.1 Tcas_P_969265.1
OG0008039: Agra_P_050292743.1 Cass_AG9767942.1 Dpon_P_019767966.1 Smad_g4980.t1 Sory_P_030750408.1 Tcas_P_971491.1
OG0008040: Agra_P_050292768.1 Cass_AH1123990.1 Dpon_P_048523285.1 Smad_g10276.t1 Sory_P_030759374.1 Tcas_P_975603.1
OG0008041: Agra_P_050292798.1 Cass_AG9770235.1 Dpon_P_019769671.2 Smad_g12750.t1 Sory_P_030747529.1 Tcas_P_971970.1
OG0008042: Agra_P_050292813.1 Cass_AG9770251.1 Dpon_P_019769634.2 Smad_g5261.t1 Sory_P_030747567.1 Tcas_P_968688.1
OG0008043: Agra_P_050292817.1 Cass_AG9770190.1 Dpon_P_019769690.1 Smad_g5262.t1 Sory_P_030747568.1 Tcas_P_968766.1
OG0008044: Agra_P_050292828.1 Cass_AG9770237.1 Dpon_P_019769698.1 Smad_g7152.t1 Sory_P_030747658.1 Tcas_P_008190584.1
OG0008045: Agra_P_050292879.1 Cass_AG9762270.1 Dpon_P_019773117.1 Smad_g12600.t1 Sory_P_030759522.1 Tcas_P_972888.1
OG0008046: Agra_P_050292889.1 Cass_AG9762382.1 Dpon_P_019753344.1 Smad_g8693.t1 Sory_P_030760073.1 Tcas_P_008195985.1
kosukesano@at138:~/tools/for_paml/data$ mv /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/*fasta 6sp_nama_data/
kosukesano@at138:~/tools/for_paml/data$ ls 6sp_nama_data/
Agra.fasta  Agra_mt.fasta  Cass.fasta  Dmel_mt.fasta  Dpon.fasta  Dpon_mt.fasta  Smad.fasta  Sory.fasta  Sory_mt.fasta  Tcas.fasta  Tcas_mt.fasta  query.fasta
kosukesano@at138:~/tools/for_paml/data$ cd 6sp_nama_data/
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta  Agra_mt.fasta  Cass.fasta  Dmel_mt.fasta  Dpon.fasta  Dpon_mt.fasta  Smad.fasta  Sory.fasta  Sory_mt.fasta  Tcas.fasta  Tcas_mt.fasta  query.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ rm *_mt.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Smad.fasta  Sory.fasta  Tcas.fasta  query.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less query.fasta 
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ rm query.fasta 
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$

間違えて.fastaファイルをrmしちゃったので、もう一度もとのファイルからコピーしてきた。

kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ python edit.py 
../6sp_nama_data/Tcas.fasta に保存しました。
../6sp_nama_data/Agra.fasta に保存しました。
../6sp_nama_data/Smad.fasta に保存しました。
../6sp_nama_data/Cass.fasta に保存しました。
../6sp_nama_data/Dpon.fasta に保存しました。
../6sp_nama_data/Sory.fasta に保存しました。
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Smad.fasta  Sory.fasta  Tcas.fasta  edit.py
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Agra.fasta 
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Cass.fasta 
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Dpon.fasta 
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Smad.fasta 
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ rm *.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/*fasta
ls: cannot access '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/*fasta': No such file or directory
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/
Agra.fasta_blast_results.txt     Cass.fasta_db.nin                Dpon.fasta_db.nsq                Smad.fasta_db.nto                Tcas.fasta_db.nhr
Agra.fasta_db.ndb                Cass.fasta_db.njs                Dpon.fasta_db.ntf                Sory.fasta_blast_results.txt     Tcas.fasta_db.nin
Agra.fasta_db.nhr                Cass.fasta_db.not                Dpon.fasta_db.nto                Sory.fasta_db.ndb                Tcas.fasta_db.njs
Agra.fasta_db.nin                Cass.fasta_db.nsq                Dpon_mt.fasta_blast_results.txt  Sory.fasta_db.nhr                Tcas.fasta_db.not
Agra.fasta_db.njs                Cass.fasta_db.ntf                Dpon_mt.fasta_db.ndb             Sory.fasta_db.nin                Tcas.fasta_db.nsq
Agra.fasta_db.not                Cass.fasta_db.nto                Dpon_mt.fasta_db.nhr             Sory.fasta_db.njs                Tcas.fasta_db.ntf
Agra.fasta_db.nsq                Dmel_mt.fasta_blast_results.txt  Dpon_mt.fasta_db.nin             Sory.fasta_db.not                Tcas.fasta_db.nto
Agra.fasta_db.ntf                Dmel_mt.fasta_db.ndb             Dpon_mt.fasta_db.njs             Sory.fasta_db.nsq                Tcas_mt.fasta_blast_results.txt
Agra.fasta_db.nto                Dmel_mt.fasta_db.nhr             Dpon_mt.fasta_db.not             Sory.fasta_db.ntf                Tcas_mt.fasta_db.ndb
Agra_mt.fasta_blast_results.txt  Dmel_mt.fasta_db.nin             Dpon_mt.fasta_db.nsq             Sory.fasta_db.nto                Tcas_mt.fasta_db.nhr
Agra_mt.fasta_db.ndb             Dmel_mt.fasta_db.njs             Dpon_mt.fasta_db.ntf             Sory_mt.fasta_blast_results.txt  Tcas_mt.fasta_db.nin
Agra_mt.fasta_db.nhr             Dmel_mt.fasta_db.not             Dpon_mt.fasta_db.nto             Sory_mt.fasta_db.ndb             Tcas_mt.fasta_db.njs
Agra_mt.fasta_db.nin             Dmel_mt.fasta_db.nsq             OrthoFinder                      Sory_mt.fasta_db.nhr             Tcas_mt.fasta_db.not
Agra_mt.fasta_db.njs             Dmel_mt.fasta_db.ntf             Smad.fasta_blast_results.txt     Sory_mt.fasta_db.nin             Tcas_mt.fasta_db.nsq
Agra_mt.fasta_db.not             Dmel_mt.fasta_db.nto             Smad.fasta_db.ndb                Sory_mt.fasta_db.njs             Tcas_mt.fasta_db.ntf
Agra_mt.fasta_db.nsq             Dpon.fasta_blast_results.txt     Smad.fasta_db.nhr                Sory_mt.fasta_db.not             Tcas_mt.fasta_db.nto
Agra_mt.fasta_db.ntf             Dpon.fasta_db.ndb                Smad.fasta_db.nin                Sory_mt.fasta_db.nsq             co1blast.sh
Agra_mt.fasta_db.nto             Dpon.fasta_db.nhr                Smad.fasta_db.njs                Sory_mt.fasta_db.ntf
Cass.fasta_blast_results.txt     Dpon.fasta_db.nin                Smad.fasta_db.not                Sory_mt.fasta_db.nto
Cass.fasta_db.ndb                Dpon.fasta_db.njs                Smad.fasta_db.nsq                Tcas.fasta_blast_results.txt
Cass.fasta_db.nhr                Dpon.fasta_db.not                Smad.fasta_db.ntf                Tcas.fasta_db.ndb
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ cd ~/tools/
kosukesano@at138:~/tools$ ls
AUGUSTUS_CONFIG_copy  Arthropoda.fa        EDTA_git_install         ProtHint_git_install  braker_git_install  for_MAFFT   for_brakertest  for_orthofinder  for_softmask
All_AUGUSTUS_test     DIAMOND_git_install  GeneMarkETP_git_install  TSEBRA_git_install    for_ASTRAL          for_braker  for_cafe        for_paml         pyenv_env
kosukesano@at138:~/tools$ cd ~/tools/for_braker
kosukesano@at138:~/tools/for_braker$ ls
Femo  Femo_pilon  Kohuki  Kohuki_thread_one  Madara  OnlyProtein_femo  OnlyProtein_madara  nama_data
kosukesano@at138:~/tools/for_braker$ cd Madara/
kosukesano@at138:~/tools/for_braker/Madara$ ls
BUSCO_OUTPUT_Madara_WITHRNA  madara_braker.sh            madara_braker.sh.o26149250   madara_braker.sh.pe26149256  madara_braker.zip          madara_busco.sh.o26170184
braker                       madara_braker.sh.e26149250  madara_braker.sh.o26149256   madara_braker.sh.po26149250  madara_busco.sh            madara_busco.sh.pe26170184
busco_downloads              madara_braker.sh.e26149256  madara_braker.sh.pe26149250  madara_braker.sh.po26149256  madara_busco.sh.e26170184  madara_busco.sh.po26170184
kosukesano@at138:~/tools/for_braker/Madara$ cd braker/
kosukesano@at138:~/tools/for_braker/Madara/braker$ ls
Augustus      braker.aa         braker.codingseq.zip  braker.log              errors             hintsfile.gff  t1tyusyutu.py
GeneMark-ETP  braker.codingseq  braker.gtf            braker_t1_sequences.aa  genome_header.map  species        what-to-cite.txt
kosukesano@at138:~/tools/for_braker/Madara/braker$ cp braker.codingseq ~/tools/for_paml/data/6sp_nama_data/Smad.fasta
kosukesano@at138:~/tools/for_braker/Madara/braker$ less ~/tools/for_paml/data/6sp_nama_data/Smad.fasta
kosukesano@at138:~/tools/for_braker/Madara/braker$ cd
kosukesano@at138:~$ ls
Desktop               local               mafft_plusname.sh.e26313496  mafft_plusname.sh.o26313501  manualphilo.sh.o26271286  manualphylo.sh.o26819185       reference_sequence  rst1
bsAtest.sh.e26312004  mafft.sh.e26293911  mafft_plusname.sh.e26313501  manualphilo.sh.e26271286     manualphilo.sh.o26837716  old_envilonment_until20240430  results_sh_eando    rub
bsAtest.sh.o26312004  mafft.sh.o26293911  mafft_plusname.sh.o26313496  manualphilo.sh.e26837716     manualphylo.sh.e26819185  pyenv_conda_environment        rst                 tools
kosukesano@at138:~$ cd reference_sequence/
kosukesano@at138:~/reference_sequence$ ls
Ecoli                    merge_rbh.py                        rbh.sh            rbh.sh.e26240603  rbh.sh.o26240603   rbh.sh.pe26240603  rbh.sh.po26240603
Madara                   merged_best_hits.txt                rbh.sh.e26231590  rbh.sh.o26231590  rbh.sh.pe26231590  rbh.sh.po26231590  rbh_result.txt
Sory_Tcas_Dmel_Ecol_ref  merged_best_hits_with_function.txt  rbh.sh.e26231593  rbh.sh.o26231593  rbh.sh.pe26231593  rbh.sh.po26231593  reciprocal_best_hits_Dmel.txt
addfunction_test.py      new_rbh.py                          rbh.sh.e26231600  rbh.sh.o26231600  rbh.sh.pe26231600  rbh.sh.po26231600  reciprocal_best_hits_Ecol.txt
addproduct_test.py       out_Dmel_blastp_RefAsMadara.txt     rbh.sh.e26231603  rbh.sh.o26231603  rbh.sh.pe26231603  rbh.sh.po26231603  reciprocal_best_hits_Sory.txt
addproduct_test2.py      out_Ecol_blastp_RefAsMadara.txt     rbh.sh.e26231610  rbh.sh.o26231610  rbh.sh.pe26231610  rbh.sh.po26231610  reciprocal_best_hits_Tcas.txt
blastp_4sp_test.sh       out_Sory_blastp_RefAsMadara.txt     rbh.sh.e26237227  rbh.sh.o26237227  rbh.sh.pe26237227  rbh.sh.po26237227  reciprocal_best_hits_madara.txt
blastp_RefAsMadara.sh    out_Tcas_blastp_RefAsMadara.txt     rbh.sh.e26237813  rbh.sh.o26237813  rbh.sh.pe26237813  rbh.sh.po26237813
functional_annotation    out_madara_as_ref_blastp_.txt       rbh.sh.e26237904  rbh.sh.o26237904  rbh.sh.pe26237904  rbh.sh.po26237904
gene_function.txt        out_madara_blastp_test.txt          rbh.sh.e26238740  rbh.sh.o26238740  rbh.sh.pe26238740  rbh.sh.po26238740
makedic_test.py          rbh.py                              rbh.sh.e26238754  rbh.sh.o26238754  rbh.sh.pe26238754  rbh.sh.po26238754
kosukesano@at138:~/reference_sequence$ cd ../old_envilonment_until20240430/
kosukesano@at138:~/old_envilonment_until20240430$ ls
EDTA  GeMoMa_temp  busco_downloads  cafetest  gall  leaf_beetle  other_weevil  outgroup  paml_test  ronbun_sp
kosukesano@at138:~/old_envilonment_until20240430$ cd other_weevil/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil$ ls
Anthonomus_grandis_grandis  Ceutorhynchus_assimilis  Cylas_formicarius  Nicrophorus_vespilloides  Soryzae
kosukesano@at138:~/old_envilonment_until20240430/other_weevil$ cd Anthonomus_grandis_grandis/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis$ ls
 README.md  'download?include_annotation_type=GENOME_FASTA,GENOME_GFF,RNA_FASTA,CDS_FASTA,PROT_FASTA,SEQUENCE_REPORT'   ncbi_dataset
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis$ cd ncbi_dataset/data/GCF_022605725.1/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ ls
Anthonomus_buscotest.sh            Anthonomus_buscotest.sh.o25642658   Anthonomus_buscotest.sh.po25642658        busco_downloads  cds_from_genomic.fna  protein.faa  sequence_report.jsonl
Anthonomus_buscotest.sh.e25642658  Anthonomus_buscotest.sh.pe25642658  GCF_022605725.1_icAntGran1.3_genomic.fna  busco_out        genomic.gff           rna.fna
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Agra.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ less ~/tools/for_paml/data/6sp_nama_data/Agra.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ cd ../../../../Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ ls
Ceutorhynchus_buscotest.sh            Ceutorhynchus_buscotest.sh.o25642655   Ceutorhynchus_buscotest.sh.po25642655      busco_downloads  cds_from_genomic.fna  protein.faa
Ceutorhynchus_buscotest.sh.e25642655  Ceutorhynchus_buscotest.sh.pe25642655  GCA_917834065.1_PGI_CEUTPL_v4_genomic.fna  busco_out        genomic.gff           sequence_report.jsonl
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Cass.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cd ../../../../Soryzae/ncbi_dataset/data/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data$ ls
GCA_002938485.2  GCF_002938485.1  assembly_data_report.jsonl  data_summary.tsv  dataset_catalog.json
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data$ cd GCF_002938485.1/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1$ ls
GCF_002938485.1_Soryzae_2.0_genomic.fna  Soryzae_busco.sh.e26203344  Soryzae_busco.sh.pe26203344  busco_downloads  cds_from_genomic.fna  genomic.gff  protein.faa  sequence_report.jsonl
Soryzae_busco.sh                         Soryzae_busco.sh.o26203344  Soryzae_busco.sh.po26203344  busco_out        genomic.gbff          genomic.gtf  rna.fna
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Sory.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1$ cd ../../../../
kosukesano@at138:~/old_envilonment_until20240430/other_weevil$ cd ../
kosukesano@at138:~/old_envilonment_until20240430$ ls
EDTA  GeMoMa_temp  busco_downloads  cafetest  gall  leaf_beetle  other_weevil  outgroup  paml_test  ronbun_sp
kosukesano@at138:~/old_envilonment_until20240430$ ls outgroup/
Drosophila_melanogaster  Tribolium_castaneum
kosukesano@at138:~/old_envilonment_until20240430$ ls ronbun_sp/
Dendroctonus_ponderosae  Drosophila_melanogaster  Orthotest  Rhynchophorus_ferrugineus  Tribolium_castaneum  cds_matome  pep_matome  test  three_sp_cds_matome
kosukesano@at138:~/old_envilonment_until20240430$ cd ronbun_sp/
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp$ ls
Dendroctonus_ponderosae  Drosophila_melanogaster  Orthotest  Rhynchophorus_ferrugineus  Tribolium_castaneum  cds_matome  pep_matome  test  three_sp_cds_matome
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp$ cd Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1/
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Dpon.fasta
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1$ cd ../../../../../outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3/
kosukesano@at138:~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Tcas.fasta
kosukesano@at138:~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3$ cd ~/tools/for_paml/data/6sp_nama_data/
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Smad.fasta  Sory.fasta  Tcas.fasta  edit.py
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ 

CDS配列ファイルのヘッダーだけproteinにするスクリプト

### ch_hed.py

from Bio import SeqIO

# ファイルパスの設定
paml_fasta = "/home/kosukesano/tools/for_paml/data/6sp_nama_data/Agra.fasta"
orthofinder_fasta = "/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/Agra.fasta"
output_fasta = "/home/kosukesano/tools/for_paml/data/6sp_nama_data/changehedder/Agra_changehedder.fasta"

# orthofinderのファイルからprotein_IDをキーにした辞書を作成
orthofinder_dict = {}
for record in SeqIO.parse(orthofinder_fasta, "fasta"):
    protein_id = record.id.split()[0]  # protein_IDは最初のスペースまでの部分
    print(f"Extracted orthofinder_protein_ID: {protein_id}") 
    orthofinder_dict[protein_id] = record

# 出力用リスト
output_records = []

# pamlのファイルを処理
for record in SeqIO.parse(paml_fasta, "fasta"):
    header_parts = record.description.split("protein_id=")
    
    if len(header_parts) > 1:
        protein_id = header_parts[1].split("]")[0]  # protein_idを抽出
        print(f"Extracted CDS_protein_ID: {protein_id}")  # 抽出したprotein_IDを出力

        # orthofinderファイルで一致するprotein_IDがあるか確認
        if protein_id in orthofinder_dict:
            print(f"Match found for protein_ID: {protein_id}")
            # 一致する場合、ヘッダーを置き換え
            new_header = orthofinder_dict[protein_id].description
            record.description = new_header
        else:
            print(f"No match found for protein_ID: {protein_id}")
    else:
        print(f"protein_ID not found in header: {record.description}")

    # 出力リストに追加
    output_records.append(record)

# 新しいファイルに書き出し
SeqIO.write(output_records, output_fasta, "fasta")
print(f"Modified fasta file saved to: {output_fasta}")
### ~/tools/for_paml/data/6sp_nama_data/changehedder/edit.pyの中身

import os
from Bio import SeqIO

# 入力ディレクトリと出力ディレクトリのパス
input_dir = '../changehedder/'
output_dir = '../RemakeHedder_6sp_afterchange/'

# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 入力ディレクトリ内のすべての .fasta ファイルを処理
for input_file in os.listdir(input_dir):
    if input_file.endswith('.fasta'):
        input_path = os.path.join(input_dir, input_file)
        output_path = os.path.join(output_dir, input_file)

        # 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
        with open(output_path, 'w') as outfile:
            for record in SeqIO.parse(input_path, 'fasta'):
                header = record.description
                seq = str(record.seq)

                # ヘッダーが「g」で始まる場合
                if header.startswith("g"):
                    # 新しいヘッダーは「>Smad」 + 「元のヘッダーの番号」
                    number = header.split()[0]  # ヘッダーの最初の番号部分を取得
                    new_header = f">Smad_{number}"

                # ヘッダーが「]」で終わる場合
                elif header.endswith("]"):
                    # ヘッダーの最後の「[]」内の英字を抽出
                    within_brackets = header.split('[')[-1].split(']')[0]
                    first_letter = within_brackets[0]  # 最初の1文字
                    space_after = within_brackets.split()[-1][:3]  # スペース後の3文字
                    
                    # 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
                    first_part = header.split()[1][1:]
                    new_header = f">{first_letter}{space_after}_{first_part}"

                else:
                    new_header = f">{header.split()[0]}"

                # 新しいヘッダーと配列を出力ファイルに書き込む
                outfile.write(f"{new_header}\n{seq}\n")

        print(f"{output_path} に保存しました。")

1009

CDS取得の続き

昨日のch_hedder.pyを全種分行い、~/tools/for_paml/data/6sp_nama_data/changehedder/に保存した。

kosukesano@at139:~/tools/for_paml/data$ ls 6sp_nama_data/changehedder/
Agra_changehedder.fasta  Dpon_changehedder.fasta  Sory_changehedder.fasta  ch_hed.py  makedf.py
Cass_changehedder.fasta  Smad_changehedder.fasta  Tcas_changehedder.fasta  edit.py    protein_headers.csv

その後、edit.pyを実行し、ファイルを~/tools/for_paml/data/241009_RemakeHedder_6sp_afterchange/に保存した。

kosukesano@at139:~/tools/for_paml/data/6sp_nama_data/changehedder$ python edit.py 
../../241009_RemakeHedder_6sp_afterchange/Sory_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Dpon_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Tcas_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Smad_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Cass_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Agra_changehedder.fasta に保存しました。
kosukesano@at139:~/tools/for_paml/data/6sp_nama_data/changehedder$ cd ../../

上記のファイルを使用し、SCOを抽出する。そのスクリプト~/tools/for_paml/data/241008_SCO/new_makefna.pyは以下の通り。

### new_makefna.pyの中身

# 必要なモジュールをインポート
import os

# ファイルパスの設定
orthogroups_file = "extracted_orthogroups.txt"
input_dir = "../241009_RemakeHedder_6sp_afterchange/"
output_dir = "/home/kosukesano/tools/for_paml/data/CDS_SCO/"

# ディレクトリが存在しない場合、作成
os.makedirs(output_dir, exist_ok=True)

# OG番号と遺伝子IDをextracted_orthogroups.txtから取得
with open(orthogroups_file, "r") as ortho_f:
    for line in ortho_f:
        if line.strip():  # 空行を無視
            # 行をOG番号と遺伝子IDに分割
            og_number, gene_ids_str = line.split(":")
            og_number = og_number.strip()
            gene_ids = gene_ids_str.strip().split()

            # 遺伝子IDを種ごとに分割
            genes = {
                "Agra": gene_ids[0],
                "Cass": gene_ids[1],
                "Dpon": gene_ids[2],
                "Smad": gene_ids[3],
                "Sory": gene_ids[4],
                "Tcas": gene_ids[5]
            }

            # 出力ファイルのパスを設定
            output_file = os.path.join(output_dir, f"{og_number}.fna")

            # 出力ファイルを開く
            with open(output_file, "w") as out_f:
                # 各種ごとに遺伝子IDを取得し、対応するファイルからシーケンスを検索
                for species, gene_id in genes.items():
                    fasta_file = os.path.join(input_dir, f"{species}_changehedder.fasta")

                    with open(fasta_file, "r") as fasta_f:
                        write_flag = False
                        for line in fasta_f:
                            if line.startswith(f">{gene_id}"):
                                # ヘッダー行を見つけたら、出力ファイルに書き込みを開始
                                out_f.write(line)
                                print(line.strip())  # 標準出力にヘッダーを表示
                                write_flag = True
                            elif line.startswith(">") and write_flag:
                                # 次のヘッダー行が見つかったら、現在の遺伝子の書き込みを終了
                                write_flag = False
                            elif write_flag:
                                # シーケンス部分を書き込む
                                out_f.write(line)
                                print(line.strip())  # 標準出力にシーケンスを表示

            print(f"{og_number}.fna ファイルが {output_dir} に保存されました。")

5分くらいで終わる。

次にこれらのCDSをアライメントする。~/tools/for_paml/data/241008_SCOmafft.shを作成、qsubで投げた。

### mafft.sh

#$ -S /bin/bash

source ~/tools/pyenv_env/ManualPhilo_profile

# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO/"
output_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fna)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_maffted.fna"

  # MAFFTを実行
  mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"

  echo "Aligned file created: $output_file"
done

こっちは結構時間かかる。

この後、ヘッダーを種名のみにする必要があった。そのためのスクリプトfix.py~/tools/for_paml/data/CDS_SCOに作成。

### fix.pyの中身


import os

# 対象ディレクトリ
input_dir = "/home/kosukesano/tools/for_paml/data/CDS_SCO"

# ディレクトリ内の_maffted.fnaファイルを処理
for filename in os.listdir(input_dir):
    if filename.endswith("_maffted.fna"):
        input_filepath = os.path.join(input_dir, filename)
        output_filename = filename.replace("_maffted.fna", "_maffted_fixed.fna")
        output_filepath = os.path.join(input_dir, output_filename)
        
        with open(input_filepath, 'r') as infile, open(output_filepath, 'w') as outfile:
            for line in infile:
                if line.startswith(">"):
                    # ヘッダーの「>」とその後の4文字に置き換える
                    new_header = ">" + line[1:5] + "\n"
                    outfile.write(new_header)
                else:
                    # 配列行はそのまま書き込む
                    outfile.write(line)

print("ヘッダー置き換え処理が完了しました。")

scorpionでのEkamソフトマスク続き

dendezia@scorpion:~$ ls
RM_671004.MonOct70853342024  old_envilonment_until20241004  pyenv_conda_environment  tool
dendezia@scorpion:~$ ls RM_671004.MonOct70853342024/
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa

できてるけど、出力ファイルのRM_671004.MonOct70853342024/がホームディレクトリに行っちゃってる。

mvで移動させた。

dendezia@scorpion:~$ ls
RM_671004.MonOct70853342024  old_envilonment_until20241004  pyenv_conda_environment  tool
dendezia@scorpion:~$ mv RM_671004.MonOct70853342024/ ~/tool/for_softmask/Ekam_softmask
dendezia@scorpion:~$ ls ~/tool/for_softmask/Ekam_softmask
Ekam_BLAST_DATABASE-families.fa   Ekam_BLAST_DATABASE.njs  Ekam_BLAST_DATABASE.nsq          Ekam_RepeatModeler.sh.e2016  Ekam_RepeatModeler.sh.o2016
Ekam_BLAST_DATABASE-families.stk  Ekam_BLAST_DATABASE.nnd  Ekam_BLAST_DATABASE.translation  Ekam_RepeatModeler.sh.e2017  Ekam_RepeatModeler.sh.o2017
Ekam_BLAST_DATABASE.nhr           Ekam_BLAST_DATABASE.nni  Ekam_RepeatModeler.sh            Ekam_RepeatModeler.sh.e2018  Ekam_RepeatModeler.sh.o2018
Ekam_BLAST_DATABASE.nin           Ekam_BLAST_DATABASE.nog  Ekam_RepeatModeler.sh.e2015      Ekam_RepeatModeler.sh.o2015  RM_671004.MonOct70853342024
dendezia@scorpion:~$ 

続いてRepeatMaskerを行う。~/tool/for_softmask/Ekam_softmaskにてEkam_RepeatMasker.shを作成し、qsubで投げた。

### Ekam_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

cd /home/dendezia/tool/for_softmask/Ekam_softmask/

source /home/dendezia/tool/pyenv_env/EDTA_profile


RepeatMasker -pa 6 -lib\
        /home/dendezia/tool/for_softmask/Ekam_softmask/RM_671004.MonOct70853342024/consensi.fa.classified\
        /home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
date

結果

dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ ls ../nama_data/Ekam_dataset/data/GCA_014849505.1/
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna         GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked  GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.tbl  sequence_report.jsonl
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.cat.gz  GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.out     genomic.gbff
dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ 

出力ファイルのGCA_014849505.1_AAL_Ekam_1.0_genomic.fna.cat.gzができた!

続いてProcessRepeatsを行う。~/tool/for_softmask/Ekam_softmaskEkam_ProcessRepeats.shを作成した。

### Ekam_ProcessRepeats.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

cd /home/dendezia/tool/for_softmask/Ekam_softmask/

source /home/dendezia/tool/pyenv_env/EDTA_profile

ProcessRepeats\
        -maskSource /home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna\
        -xsmall\
        -gff\
        /home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.cat.gz
date

これをqsubで投げた

IQ-TREE系統樹を使ったPAMLやり直し

~/tools/for_paml/241009_IQTREE_6sp下で/bsA/ディレクトリと/bs_null/ディレクトリを作成した。

bsAについて

~/tools/for_paml/241009_IQTREE_6sp/bsAbsA_IQTREE_paml.shtemplate.ctlを作成、bsA_IQTREE_paml.shの方をqsubで投げた。

### ~/tools/for_paml/241009_IQTREE_6sp/bsA/bsA_IQTREE_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### ~/tools/for_paml/241009_IQTREE_6sp/bsA/template.ctlの中身

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/IQTREE_6sp/data/new_tree_IQTREE_ultrametric.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

bs_nullについて

~/tools/for_paml/241009_IQTREE_6sp/bs_nullbsN_IQTREE_paml.shbsN_template.ctlを作成、bsN_IQTREE_paml.shの方をqsubで投げた。

### ~/tools/for_paml/241009_IQTREE_6sp/bs_null/bsN_IQTREE_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### ~/tools/for_paml/241009_IQTREE_6sp/bs_null/bsN_template.ctlの中身

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/IQTREE_6sp/data/new_tree_IQTREE_ultrametric.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

ASTRAL系統樹を使ったPAMLやり直し

bsAについて

~/tools/for_paml/241009_ASTRAL_6sp/bsAbsA_ASTRAL_paml.shtemplate.ctlを作成、bsA_ASTRAL_paml.shの方をqsubで投げた。

### ~/tools/for_paml/241009_ASTRAL_6sp/bsA/bsA_ASTRAL_paml.shの中身

### run_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### ~/tools/for_paml/241009_ASTRAL_6sp/bsA/template.ctlの中身

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

bs_nullについて

~/tools/for_paml/241009_ASTRAL_6sp/bs_nullbsN_ASTRAL_paml.shbsN_template.ctlを作成、bsN_ASTRAL_paml.shの方をqsubで投げた。

### ~/tools/for_paml/241009_ASTRAL_6sp/bs_null/bsN_ASTRAL_paml.shの中身

### run_paml.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### ~/tools/for_paml/241009_ASTRAL_6sp/bs_null/bsN_template.ctlの中身

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

1010

scorpionでのEkamソフトマスク結果

Ekam_ProcessRepeats.shが終わり、~/tool/for_softmask//nama_data/Ekam_dataset/data/GCA_014849505.1/ディレクトリにGCA_014849505.1_AAL_Ekam_1.0_genomic.fna.maskedが出力された。

### GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.maskedの中身の一部

>JACGEL010000001.1 Elaeidobius kamerunicus isolate PL Ekam 1 scaffold-1, whole genome shotgun sequence
taaaacaataaaaatactttattttaatattcaatattgtattaatatat
aacttaattttctctattttaactaattttcaaACCCCTAACATGTTTTC
CAGTGAGccgctaaaaaaatatcacaaaatgaactttaagtttaagttGG
AAATTTAAGACTTGAAGCTAGCTAGGATGAGTCGNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNAAATACATCTTTGATTTGTAAGTATctg
tagtatatttttggaataaaatagtttattaaatatatttcggtTTTCCT
TTTCCCGTAGGACGTTGCAAAGTGGCGACgaggatttttatatttcccta
GAAAAATAGAACCCCCTAGTTGGGAAAATTAGTGGGTTTCTAAAATTCCG
GTAAAGTAAGAAAACGTGTAGTGTAGTGTGCAGATAGAATTTGaccctaa
aataaatgattggACTGTGCACATAAATCGTCTGATGATTCTATAAACAG
ACCAAAAAGAGTAATTTTACTCAATGGGCTTGCTCAAGAACCGTATATAT
TGCTACAAAACTTAAGTCTTTCAACGAAACCTTCTGAAGCTACTTACTAG
GACCTTCTCAAGTACTTTAATAGCTATTTAAAGTTTTCCGATTACAAGGA
CTTCGATGAGGTCGAGGTAGAGAAAACGCTGGCAACCGTGGAGGCCGAGT
GCATTTTGGAGGCGGTACTAGTGACCGGTCGGGCAGCGCAGGAACGAAAA
AATCAAGTGATAGTGATGTGTTCAATCTGTCGAAAAAGTAAACATTCCga
aaacaaatgttttcatCGTAATTTTAACAGGTTTTGCAGCTTCTGCAAAC
TAAAGCACATAATACagtaaactgtaaaaataaaatggacattgaacaaa
ataccaataatGACAATgtgaatgatttaaattttaatataaatttaaat
gaatttccgGTCTATACCACATAACATTTCTAGTcctattgaaatatttt
ttaaataatagtctGTATAATTTTGAACTGATTCAGGTGCAGTACTTTCG
TGTACACCCTATTCGAtgtatgcaaattatttttgagatattttctTGAT
TAAAACTTATGTAACATGATTAATTTGAGTGGTAGAATAATTTCACCAAT
TGGTCAAGTTGTCCTAAAGctggaatataataaacaagtttCGAATTTAA
.
.
.
.
.

ソフトマスクに治ってる部分もあるけど、ハードマスクのままの部分もある。どういうことだ?

ASTRAL系統樹を使ったPAMLやり直しの続き

~/tools/for_paml/241009_ASTRAL_6spディレクトリでbs_lrp.pyを作成し、実行した。

### ~/tools/for_paml/241009_ASTRAL_6sp/bs_lrp.pyの中身

import os
import re
from scipy.stats import chi2

def parse_lnL(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            match = re.search(r'lnL\(ntime: \d+  np: (\d+)\):\s+(-?\d+\.\d+)', line)
            if match:
                np = int(match.group(1))
                lnL = float(match.group(2))
                return np, lnL
    return None, None

def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
    lr_stat = 2 * (alt_lnL - null_lnL)
    df = alt_np - null_np
    p_val = chi2.sf(lr_stat, df)
    return p_val

def main():
    alt_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bsA/result'
    null_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bs_null/result'
    output_file = 'branch_site_lrt_results.txt'

    alt_dir = os.path.expanduser(alt_dir)
    null_dir = os.path.expanduser(null_dir)

    og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]

    with open(output_file, 'w') as out_f:
        out_f.write('OG_num\tp_val\tpositive_selection\n')

        for og_file in og_files:
            og_num = og_file.split('_')[0]
            alt_file = os.path.join(alt_dir, og_file)
            null_file = os.path.join(null_dir, og_file)

            if os.path.exists(null_file):
                alt_np, alt_lnL = parse_lnL(alt_file)
                null_np, null_lnL = parse_lnL(null_file)

                if alt_np is not None and null_np is not None:
                    p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
                    reject_null = '+' if p_val < 0.05 else '-'
                    out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')

if __name__ == "__main__":
    main()

これを実行したところ、branch_site_lrt_results.txtができた。ローカルに送ってFDRにかける。

:~/bio/for_paml/241010$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/branch_site_lrt_results.txt /Users/kosukesano/bio/for_paml/241010/ASTRAL_branch_site_
lrt_results.txt
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
branch_site_lrt_results.txt                                                                                                                                   100%   27KB 684.1KB/s   00:00    
:~/bio/for_paml/241010$ 

FDRにかけた結果は以下の通り

AST=read.csv("/Users/kosukesano/bio/for_paml/241010/hosei_ASTRAL_branch_site_lrt_results.txt", sep="\t")|>
  dplyr::filter(significant == "True") 

orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

AST_2=dplyr::left_join(AST, orthogroups, by = c(OG_num = "V1"))|>
  rename(gene_ID = V5)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) 

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")

AST_3=dplyr::left_join(AST_2, fa, by = c(gene_ID = "Madara"))

deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")

deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")

deg_all=dplyr::full_join(deg1, deg2, by = "gene_ID")

AST_4=dplyr::left_join(AST_3, deg_all, by  = "gene_ID")|>###完成系
  dplyr::select(gene_ID, q_val, Sory_GeneFunction, ovary.body_log2FC, ovary.body_adjPval, adult.llarva_log2FC, adult.llarva_adjPval, adult.mlarva_log2FC, adult.mlarva_adjPval) |>
  print()
    gene_ID        q_val
1  g2313.t1 3.310264e-02
2  g3206.t1 1.004678e-13
3 g12267.t1 2.607850e-02
4  g9945.t1 3.980078e-15
5  g4236.t1 8.062344e-03
6 g10787.t1 7.040355e-03
7  g6829.t1 1.627589e-02
8  g8864.t1 9.034552e-04
                                                                Sory_GeneFunction
1                                            protein PTCD3 homolog, mitochondrial
2 LOW QUALITY PROTEIN: cell division cycle and apoptosis regulator protein 1-like
3                                                     D-glucuronyl C5-epimerase B
4                                                           laminin subunit alpha
5                           acyl-CoA dehydrogenase family member 9, mitochondrial
6          probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
7                                methyltransferase-like protein 17, mitochondrial
8                                                        importin subunit alpha-3
  ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1         0.8232494           1.43e-13                  NA                   NA
2                NA                 NA                  NA                   NA
3                NA                 NA                  NA                   NA
4        -1.6162768           9.89e-06                  NA                   NA
5                NA                 NA                  NA                   NA
6         2.5426042           3.48e-17                  NA                   NA
7         1.8062424           4.66e-14                  NA                   NA
8                NA                 NA                  NA                   NA
  adult.mlarva_log2FC adult.mlarva_adjPval
1                  NA                   NA
2                  NA                   NA
3                  NA                   NA
4                  NA                   NA
5                  NA                   NA
6                  NA                   NA
7                  NA                   NA
8                  NA                   NA

8つの遺伝子で正の選択が検出された

IQ-TREE系統樹を使ったPAMLやり直しの続き

~/tools/for_paml/241009_IQTREE_6spディレクトリでbs_lrp.pyを作成し、実行したが、謎のエラーが出て実行できなかった。

おそらくlnL後に空白が多く含まれている事が原因か?

### /home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bs_null/result/OG0008385_maffted_fixed_branch_altの一部

TREE #  1:  (1, ((2, 3), 4), (5, 6));   MP score: -1
lnL(ntime:  9  np: 13):  -5213.435205      +0.000000
   7..1     7..8     8..9     9..2     9..3     8..4     7..10   10..5    10..6  
 1.563575 0.056729 0.383439 0.872471 2.465303 1.081219 0.906124 1.759343 4.388369 2.018860 0.736067 0.164952 0.101980

Note: Branch length is defined as number of nucleotide substitutions per codon (not per neucleotide site).

tree length =  13.47657

これを修正したファイルnew_bs_lrp.pyを作成し、実行した。

### new_lrp.pyの中身

import os
import re
from scipy.stats import chi2

def parse_lnL(file_path):
    try:
        with open(file_path, 'r') as f:
            for line in f:
                print(f"Processing line: {line.strip()}")  # デバッグ用
                match = re.search(r'lnL\(ntime:\s*\d+\s+np:\s*(\d+)\):\s+(-?\d+\.\d+)', line)
                if match:
                    np = int(match.group(1))
                    lnL = float(match.group(2))
                    return np, lnL
        print(f"{file_path} に 'lnL' 行が見つかりませんでした。形式を確認してください。")
        return None, None
    except Exception as e:
        print(f"{file_path} を開く際にエラーが発生しました: {e}")
        return None, None

def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
    try:
        lr_stat = 2 * (alt_lnL - null_lnL)
        df = alt_np - null_np
        p_val = chi2.sf(lr_stat, df)
        return p_val
    except Exception as e:
        print(f"LRT計算中にエラーが発生しました: {e}")
        return None

def main():
    alt_dir = '/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bsA/result'
    null_dir = '/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bs_null/result'
    output_file = 'branch_site_lrt_results.txt'

    alt_dir = os.path.expanduser(alt_dir)
    null_dir = os.path.expanduser(null_dir)

    # 処理するOGファイルリストの取得
    og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]

    with open(output_file, 'w') as out_f:
        out_f.write('OG_num\tp_val\tpositive_selection\n')

        # 各OGファイルについてループ処理
        for idx, og_file in enumerate(og_files):
            og_num = og_file.split('_')[0]
            alt_file = os.path.join(alt_dir, og_file)
            null_file = os.path.join(null_dir, og_file)

            print(f"{idx+1}/{len(og_files)}: {og_num} の解析を開始します...")

            if os.path.exists(null_file):
                alt_np, alt_lnL = parse_lnL(alt_file)
                null_np, null_lnL = parse_lnL(null_file)

                if alt_np is not None and null_np is not None:
                    p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
                    if p_val is not None:
                        reject_null = '+' if p_val < 0.05 else '-'
                        out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')
                        print(f"{og_num} の解析が完了しました。p値: {p_val}, 正の選択: {reject_null}")
                    else:
                        print(f"{og_num} のLRT計算に失敗しました。")
                else:
                    print(f"{og_num} のlnLデータが不完全です。")
            else:
                print(f"{og_num} の対応するnullモデルファイルが見つかりませんでした。")

if __name__ == "__main__":
    main()

これを実行したところ、branch_site_lrt_results.txtに結果がちゃんと出力された。

これをローカルにコピー。

:~/bio/for_paml/241010$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/branch_site_lrt_results.txt /Users/kosukesano/bio/for_paml/241010/IQTREE_branch_site_lrt_results.txt
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
branch_site_lrt_results.txt                                                                                                                                   100%   27KB 967.8KB/s   00:00    
:~/bio/for_paml/241010$ 

FDR後の結果は以下の通り

IQT=read.csv("/Users/kosukesano/bio/for_paml/241010/hosei_IQTREE_branch_site_lrt_results.txt", sep="\t")|>
  dplyr::filter(significant == "True") 

orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

IQT_2=dplyr::left_join(IQT, orthogroups, by = c(OG_num = "V1"))|>
  rename(gene_ID = V5)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) 

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")


IQT_3=dplyr::left_join(IQT_2, fa, by = c(gene_ID = "Madara"))

deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")

deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")

deg_all=dplyr::full_join(deg1, deg2, by = "gene_ID")

IQT_4=dplyr::left_join(IQT_3, deg_all, by  = "gene_ID")|>###完成系
  dplyr::select(gene_ID, q_val, Sory_GeneFunction, ovary.body_log2FC, ovary.body_adjPval, adult.llarva_log2FC, adult.llarva_adjPval, adult.mlarva_log2FC, adult.mlarva_adjPval) |>
  print()
    gene_ID        q_val
1 g12267.t1 2.458483e-02
2  g9945.t1 2.144997e-14
3 g10111.t1 1.777694e-04
4 g10787.t1 7.414778e-03
5  g7878.t1 1.201570e-12
6  g4328.t1 6.210582e-13
7  g1127.t1 6.411997e-03
8  g6829.t1 6.526610e-03
                                                       Sory_GeneFunction
1                                            D-glucuronyl C5-epimerase B
2                                                  laminin subunit alpha
3                                                        protein cueball
4 probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
5                                   uncharacterized protein LOC115876326
6                                                               cullin-5
7                                                   ruvB-like helicase 1
8                       methyltransferase-like protein 17, mitochondrial
  ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1                NA                 NA                  NA                   NA
2         -1.616277           9.89e-06                  NA                   NA
3                NA                 NA                  NA                   NA
4          2.542604           3.48e-17                  NA                   NA
5         -6.713844           1.54e-42                  NA                   NA
6          1.180746           8.71e-09                  NA                   NA
7          2.001888           1.17e-28                  NA                   NA
8          1.806242           4.66e-14                  NA                   NA
  adult.mlarva_log2FC adult.mlarva_adjPval
1                  NA                   NA
2                  NA                   NA
3                  NA                   NA
4                  NA                   NA
5                  NA                   NA
6                  NA                   NA
7                  NA                   NA
8                  NA                   NA

PAML結果のまとめ

AST_join=AST|>
  dplyr::mutate(ASTRAL = stringr::str_replace(positive_selection, "\\+", "ASTRAL")) 
IQT_join=IQT|>
  dplyr::mutate(IQTREE = stringr::str_replace(positive_selection, "\\+", "IQ-TREE")) 

df=dplyr::full_join(IQT_join, AST_join, by  = "OG_num")
orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")
df2=dplyr::left_join(df, orthogroups, by = c(OG_num = "V1"))|>
  rename(gene_ID = V5)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
df3=dplyr::left_join(df2, fa, by = c(gene_ID = "Madara"))
deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")

deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")

deg_all=dplyr::full_join(deg1, deg2, by = "gene_ID")
df4=dplyr::left_join(df3, deg_all, by  = "gene_ID")|>###完成系
  tidyr::unite(tree_tool, ASTRAL, IQTREE, sep = "/") |>
  dplyr::select(gene_ID, tree_tool, Sory_GeneFunction, ovary.body_log2FC, ovary.body_adjPval, adult.llarva_log2FC, adult.llarva_adjPval, adult.mlarva_log2FC, adult.mlarva_adjPval) |>
  dplyr::mutate(tree_tool = stringr::str_replace(tree_tool, "\\/NA", ""))|>
  dplyr::mutate(tree_tool = stringr::str_replace(tree_tool, "NA\\/", ""))|>
  print()
     gene_ID      tree_tool
1  g12267.t1 ASTRAL/IQ-TREE
2   g9945.t1 ASTRAL/IQ-TREE
3  g10111.t1        IQ-TREE
4  g10787.t1 ASTRAL/IQ-TREE
5   g7878.t1        IQ-TREE
6   g4328.t1        IQ-TREE
7   g1127.t1        IQ-TREE
8   g6829.t1 ASTRAL/IQ-TREE
9   g2313.t1         ASTRAL
10  g3206.t1         ASTRAL
11  g4236.t1         ASTRAL
12  g8864.t1         ASTRAL
                                                                 Sory_GeneFunction
1                                                      D-glucuronyl C5-epimerase B
2                                                            laminin subunit alpha
3                                                                  protein cueball
4           probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
5                                             uncharacterized protein LOC115876326
6                                                                         cullin-5
7                                                             ruvB-like helicase 1
8                                 methyltransferase-like protein 17, mitochondrial
9                                             protein PTCD3 homolog, mitochondrial
10 LOW QUALITY PROTEIN: cell division cycle and apoptosis regulator protein 1-like
11                           acyl-CoA dehydrogenase family member 9, mitochondrial
12                                                        importin subunit alpha-3
   ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC
1                 NA                 NA                  NA
2         -1.6162768           9.89e-06                  NA
3                 NA                 NA                  NA
4          2.5426042           3.48e-17                  NA
5         -6.7138444           1.54e-42                  NA
6          1.1807457           8.71e-09                  NA
7          2.0018883           1.17e-28                  NA
8          1.8062424           4.66e-14                  NA
9          0.8232494           1.43e-13                  NA
10                NA                 NA                  NA
11                NA                 NA                  NA
12                NA                 NA                  NA
   adult.llarva_adjPval adult.mlarva_log2FC adult.mlarva_adjPval
1                    NA                  NA                   NA
2                    NA                  NA                   NA
3                    NA                  NA                   NA
4                    NA                  NA                   NA
5                    NA                  NA                   NA
6                    NA                  NA                   NA
7                    NA                  NA                   NA
8                    NA                  NA                   NA
9                    NA                  NA                   NA
10                   NA                  NA                   NA
11                   NA                  NA                   NA
12                   NA                  NA                   NA

合計で12個の遺伝子に正の選択が見られた

1011

scorpion環境でのBRAKER3インストール

以下のコマンドを実行

conda install -c anaconda perl
conda install -c anaconda biopython
conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-file-spec
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-list-util
conda install -c bioconda perl-module-load-conditional
conda install -c bioconda perl-posix
conda install -c bioconda perl-file-homedir
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-exception-class
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-test-pod
conda install -c bioconda perl-file-which
conda install -c bioconda perl-mce
conda install -c bioconda perl-threaded
conda install -c bioconda perl-list-util
conda install -c bioconda perl-math-utils
conda install -c bioconda cdbtools
conda install -c eumetsat perl-yaml-xs
conda install -c bioconda perl-data-dumper
conda install anaconda::gcc_linux-64

perlモジュールのインストール

cpanm Hash::Merge
cpanm List::Util
cpanm MCE::Mutex
cpanm Module::Load::Conditional
cpanm Parallel::Forkcpanm 
cpanm Scalar::Util::Numeric
cpanm YAML
cpanm Math::Utils
cpanm File::HomeDir
cpanm Thread::Queue
(braker) dendezia@scorpion:~/tool$ cpanm File::Spec::Functions
--> Working on File::Spec::Functions
Fetching http://www.cpan.org/authors/id/X/XS/XSAWYERX/PathTools-3.75.tar.gz ... OK
Configuring PathTools-3.75 ... OK
Building and testing PathTools-3.75 ... FAIL
! Installing File::Spec::Functions failed. See /home/dendezia/.cpanm/work/1728622512.1157695/build.log for details. Retry with --force to force install it.
(braker) dendezia@scorpion:~/tool$ cpanm YAML::XS
--> Working on YAML::XS
Fetching http://www.cpan.org/authors/id/T/TI/TINITA/YAML-LibYAML-v0.902.0.tar.gz ... OK
Configuring YAML-LibYAML-v0.902.0 ... OK
Building and testing YAML-LibYAML-v0.902.0 ... FAIL
! Installing YAML::XS failed. See /home/dendezia/.cpanm/work/1728622576.1157970/build.log for details. Retry with --force to force install it.
(braker) dendezia@scorpion:~/tool$ cpanm Data::Dumper
--> Working on Data::Dumper
Fetching http://www.cpan.org/authors/id/N/NW/NWCLARK/Data-Dumper-2.183.tar.gz ... OK
Configuring Data-Dumper-2.183 ... OK
Building and testing Data-Dumper-2.183 ... FAIL
! Installing Data::Dumper failed. See /home/dendezia/.cpanm/work/1728622586.1158296/build.log for details. Retry with --force to force install it.
(braker) dendezia@scorpion:~/tool$ cpanm threads
--> Working on threads
Fetching http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-2.21.tar.gz ... OK
Configuring threads-2.21 ... N/A
! Configure failed for threads-2.21. See /home/dendezia/.cpanm/work/1728622605.1158565/build.log for details.
(braker) dendezia@scorpion:~/tool$ 

4つのモジュールでエラー。前の記録を見ると同じエラーが出てるけど放置してるっぽい。

BRAKER本体やその他ツールのインストール

(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/Gaius-Augustus/BRAKER.git
Cloning into 'BRAKER'...
remote: Enumerating objects: 7335, done.
remote: Counting objects: 100% (1677/1677), done.
remote: Compressing objects: 100% (667/667), done.
remote: Total 7335 (delta 1079), reused 1539 (delta 987), pack-reused 5658 (from 1)
Receiving objects: 100% (7335/7335), 123.45 MiB | 24.97 MiB/s, done.
Resolving deltas: 100% (5430/5430), done.
(braker) dendezia@scorpion:~/tool/braker_git_install$ ls
BRAKER
(braker) dendezia@scorpion:~/tool/braker_git_install$ 
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/gatech-genemark/ProtHint.git
Cloning into 'ProtHint'...
remote: Enumerating objects: 1289, done.
remote: Counting objects: 100% (257/257), done.
remote: Compressing objects: 100% (91/91), done.
remote: Total 1289 (delta 170), reused 249 (delta 166), pack-reused 1032 (from 1)
Receiving objects: 100% (1289/1289), 56.69 MiB | 15.88 MiB/s, done.
Resolving deltas: 100% (812/812), done.
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/Gaius-Augustus/TSEBRA.git
Cloning into 'TSEBRA'...
remote: Enumerating objects: 1443, done.
remote: Counting objects: 100% (293/293), done.
remote: Compressing objects: 100% (147/147), done.
remote: Total 1443 (delta 179), reused 237 (delta 143), pack-reused 1150 (from 1)
Receiving objects: 100% (1443/1443), 59.02 MiB | 20.51 MiB/s, done.
Resolving deltas: 100% (912/912), done.
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/gatech-genemark/GeneMark-ETP.git
Cloning into 'GeneMark-ETP'...
remote: Enumerating objects: 482, done.
remote: Counting objects: 100% (46/46), done.
remote: Compressing objects: 100% (37/37), done.
remote: Total 482 (delta 11), reused 31 (delta 7), pack-reused 436 (from 1)
Receiving objects: 100% (482/482), 56.91 MiB | 21.75 MiB/s, done.
Resolving deltas: 100% (230/230), done.
Updating files: 100% (249/249), done.

遺伝研での環境は各ツール別にディレクトリを作っていたが、今回はbraker_git_installに全て集約することにした。

プロテインデータベースのダウンロード

ローカルで実行した。

:~/Downloads$ scp ~/Downloads/Arthropoda.fa.gz dendezia@scorpion:/home/dendezia/tool/braker_git_install
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
|       .+. .=o=+.|
|        o*.o.=.*+|
|       oo.*oo B.o|
|      ..o= +.* ..|
|    o .+S o * .  |
|   . o. .  E     |
|      ....o      |
|       oo+       |
|       o=        |
+----[SHA256]-----+
Arthropoda.fa.gz                                                                                                                                               100% 1219MB  99.7MB/s   00:12    
:~/Downloads$ 

これをscorpionで解凍

(braker) dendezia@scorpion:~/tool/braker_git_install$ gunzip Arthropoda.fa.gz 
(braker) dendezia@scorpion:~/tool/braker_git_install$ ls
Arthropoda.fa  BRAKER  GeneMark-ETP  ProtHint  TSEBRA

遺伝研スパコンでのEkamBRAKER実行

:~/Downloads$ scp dendezia@scorpion:/home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz /Users/kosukesano/bio/
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
|       .+. .=o=+.|
|        o*.o.=.*+|
|       oo.*oo B.o|
|      ..o= +.* ..|
|    o .+S o * .  |
|   . o. .  E     |
|      ....o      |
|       oo+       |
|       o=        |
+----[SHA256]-----+
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz
:~/Downloads$ scp /Users/kosukesano/bio/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz                                                                                                             100%   92MB 102.2MB/s   00:00    
:~/Downloads$ 

scorpionでソフトマスクしたEkamのゲノムデータを遺伝研に転送

kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta                      Madara_RNAseq                busco_downloads          femo_busco.sh.pe26221930   kohuki_busco.sh.o26238968   length.txt
BUSCO_OUTPUT_FEMO_GENOME                            Sfem_RNAseq                  femo_busco.sh            femo_busco.sh.po26221930   kohuki_busco.sh.pe26238968  madaralength.txt
BUSCO_OUTPUT_KOHUKI_GENOME                          Sfem_pilon_softmasked.fasta  femo_busco.sh.e26221930  kohuki_busco.sh            kohuki_busco.sh.po26238968
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  Sfem_softmasked.fasta        femo_busco.sh.o26221930  kohuki_busco.sh.e26238968  kohuki_softmasked.fasta
kosukesano@at139:~/tools/for_braker/nama_data$ unzip GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz
Archive:  GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz
  inflating: GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked  
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta                      Madara_RNAseq                femo_busco.sh             kohuki_busco.sh             kohuki_softmasked.fasta
BUSCO_OUTPUT_FEMO_GENOME                            Sfem_RNAseq                  femo_busco.sh.e26221930   kohuki_busco.sh.e26238968   length.txt
BUSCO_OUTPUT_KOHUKI_GENOME                          Sfem_pilon_softmasked.fasta  femo_busco.sh.o26221930   kohuki_busco.sh.o26238968   madaralength.txt
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked     Sfem_softmasked.fasta        femo_busco.sh.pe26221930  kohuki_busco.sh.pe26238968
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  busco_downloads              femo_busco.sh.po26221930  kohuki_busco.sh.po26238968
kosukesano@at139:~/tools/for_braker/nama_data$ less GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked
kosukesano@at139:~/tools/for_braker/nama_data$ mv GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked Elaeidobius_kamerunicus.masked.fna
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta      GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  Sfem_softmasked.fasta    femo_busco.sh.o26221930   kohuki_busco.sh.e26238968   kohuki_softmasked.fasta
BUSCO_OUTPUT_FEMO_GENOME            Madara_RNAseq                                       busco_downloads          femo_busco.sh.pe26221930  kohuki_busco.sh.o26238968   length.txt
BUSCO_OUTPUT_KOHUKI_GENOME          Sfem_RNAseq                                         femo_busco.sh            femo_busco.sh.po26221930  kohuki_busco.sh.pe26238968  madaralength.txt
Elaeidobius_kamerunicus.masked.fna  Sfem_pilon_softmasked.fasta                         femo_busco.sh.e26221930  kohuki_busco.sh           kohuki_busco.sh.po26238968
kosukesano@at139:~/tools/for_braker/nama_data$ 

Elaeidobius_kamerunicus.masked.fnaにファイル名を変更

~/tools/for_braker/Ekamディレクトリを作成し、BRAKERを実行した。投げたスクリプトは以下の通り。

### Ekam_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Elaeidobius_kamerunicus.masked.fna\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --threads=16\
        --species=Smadaranus_withRNA\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
date

1013

EkamBRAKER続き

なんか知らんエラーがでとる!

### Ekam_braker.sh.e27018010の中身


#**********************************************************************************
#                               BRAKER CONFIGURATION                               
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Elaeidobius_kamerunicus.masked.fna --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=16 --species=Ekamerunicus --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config --AUGUSTUS_BIN_PATH=/usr/bin --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Sun Oct 13 16:19:40 2024: braker.pl version 3.0.8
# Sun Oct 13 16:19:40 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Sun Oct 13 16:19:40 2024: Configuring of BRAKER for using external tools...
# Sun Oct 13 16:19:40 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Sun Oct 13 16:19:40 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Sun Oct 13 16:19:40 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Sun Oct 13 16:19:40 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Sun Oct 13 16:19:40 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
#*********
# WARNING: Detected whitespace in fasta header of file /home/kosukesano/tools/for_braker/nama_data/Elaeidobius_kamerunicus.masked.fna. This may later on cause problems! The pipeline will create a new file without spaces or "|" characters and a genome_header.map file to look up the old and new headers. This message will be suppressed from now on!
#*********
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 5258
Failed to execute: /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin/perl /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes_petap.pl --verbose --cores=16 --ES --gc_donor 0.001 --sequence=/lustre7/home/kosukesano/tools/for_braker/Ekam/braker/genome.fa  --soft_mask auto 1>/lustre7/home/kosukesano/tools/for_braker/Ekam/braker/GeneMark-ES.stdout 2>/lustre7/home/kosukesano/tools/for_braker/Ekam/braker/errors/GeneMark-ES.stderr !

GeneMark-ESでエラーが出てるみたいなので、braker/errors/GeneMark-ES.stderrを確認する。

### GeneMark-ES.stderrの中身

error, file not found: data/training.fna

こんな感じ。

これをググってみたらこちらのページがヒットした。どうもインプットのゲノムがマスクされすぎていて、BRAKERが配列を探せなかったらしい?

そもそも元のデータがすでにソフトマスクされていないか?

### ~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fnaの中身の一部

>JACGEL010000001.1 Elaeidobius kamerunicus isolate PL Ekam 1 scaffold-1, whole genome shotgun sequence
taaaacaataaaaatactttattttaatattcaatattgtattaatatataacttaattttctctattttaactaatttt
caaACCCCTAACATGTTTTCCAGTGAGccgctaaaaaaatatcacaaaatgaactttaagtttaagttGGAAATTTAAGA
CTTGAAGCTAGCTAGGATGAGTCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAATACATCTTTGATTTG
TAAGTATctgtagtatatttttggaataaaatagtttattaaatatatttcggtTTTCCTTTTCCCGTAGGACGTTGCAA
AGTGGCGACgaggatttttatatttccctaGAAAAATAGAACCCCCTAGTTGGGAAAATTAGTGGGTTTCTAAAATTCCG
GTAAAGTAAGAAAACGTGTAGTGTAGTGTGCAGATAGAATTTGaccctaaaataaatgattggACTGTGCACATAAATCG
TCTGATGATTCTATAAACAGACCAAAAAGAGTAATTTTACTCAATGGGCTTGCTCAAGAACCGTATATATTGCTACAAAA
CTTAAGTCTTTCAACGAAACCTTCTGAAGCTACTTACTAGGACCTTCTCAAGTACTTTAATAGCTATTTAAAGTTTTCCG
ATTACAAGGACTTCGATGAGGTCGAGGTAGAGAAAACGCTGGCAACCGTGGAGGCCGAGTGCATTTTGGAGGCGGTACTA
GTGACCGGTCGGGCAGCGCAGGAACGAAAAAATCAAGTGATAGTGATGTGTTCAATCTGTCGAAAAAGTAAACATTCCga
aaacaaatgttttcatCGTAATTTTAACAGGTTTTGCAGCTTCTGCAAACTAAAGCACATAATACagtaaactgtaaaaa
taaaatggacattgaacaaaataccaataatGACAATgtgaatgatttaaattttaatataaatttaaatgaatttccgG
TCTATACCACATAACATTTCTAGTcctattgaaatattttttaaataatagtctGTATAATTTTGAACTGATTCAGGTGC
AGTACTTTCGTGTACACCCTATTCGAtgtatgcaaattatttttgagatattttctTGATTAAAACTTATGTAACATGAT
TAATTTGAGTGGTAGAATAATTTCACCAATTGGTCAAGTTGTCCTAAAGctggaatataataaacaagtttCGAATTTAA
CGCGTTGCAATATAAAAGAGAGTAATATACCCTTGCTGGGACGAGATTTCATTGCAGAATTTAAGCAATTATTTAGGGTG
taagcaaattaattatataactaaacatagttttgatattaaataaaataggatgtttgaataaaatttaattaaaatta
tttctgaagGAAAGTTATGTTCCAAAATTTATCCAGCCCAATTAGAGTTAGATAGGTTGGTTGTTTCTGGAATTATTACT
CCAGTTAAGCATTCAGACTTGGGAACACCAATTGTCCATGTTCTAAAAGAAGATGGCTCTGTTCGCATTTGTGGTGACga
gaaaataacattaaatccatttttatagaattatagaaatatagcCCTTCAGCAAATTACGAAATATTATATCAGTATAC
TTTGCCTTGCATAAATAATTCTGCACTCAGAATAGATGCACAatttatgttcaaaatatttaaaaaaatacaaaaaaatg
caagagAGAAAGACGGTTTTAAGGATATGCATTTAACtagtttttattagtattgttTTTGATCTAAATTTTAACGTAAA
CATACTCGTTAAGTAAGTTAATTttgggttttaaaaaagtaagccTTAAGGAAGaggttattgtattttagataaatatt
tctacagCGCAAGGATAAATTTAAGTTCCGTAAATATCAGTTCCCATTTATTACTGCCATTTTTAAACGCatttataatg
caaaaaaataataaggagaTTTTTCATTAACCTTCAACAGATTATTAGGTTTATCAGCAAATCGGGGACGATTTCATTaa

これこのままかけてよくね?

というわけでこの生データをBRAKERにかける

kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cp GCA_014849505.1_AAL_Ekam_1.0_genomic.fna ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cd ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta            GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  busco_downloads           femo_busco.sh.po26221930    kohuki_busco.sh.po26238968
BUSCO_OUTPUT_FEMO_GENOME                  Madara_RNAseq                                       femo_busco.sh             kohuki_busco.sh             kohuki_softmasked.fasta
BUSCO_OUTPUT_KOHUKI_GENOME                Sfem_RNAseq                                         femo_busco.sh.e26221930   kohuki_busco.sh.e26238968   length.txt
Elaeidobius_kamerunicus.masked.fna        Sfem_pilon_softmasked.fasta                         femo_busco.sh.o26221930   kohuki_busco.sh.o26238968   madaralength.txt
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna  Sfem_softmasked.fasta                               femo_busco.sh.pe26221930  kohuki_busco.sh.pe26238968
kosukesano@at139:~/tools/for_braker/nama_data$ mv GCA_014849505.1_AAL_Ekam_1.0_genomic.fna Ekam_NotUseEDTA.fna
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta      GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  busco_downloads           femo_busco.sh.po26221930    kohuki_busco.sh.po26238968
BUSCO_OUTPUT_FEMO_GENOME            Madara_RNAseq                                       femo_busco.sh             kohuki_busco.sh             kohuki_softmasked.fasta
BUSCO_OUTPUT_KOHUKI_GENOME          Sfem_RNAseq                                         femo_busco.sh.e26221930   kohuki_busco.sh.e26238968   length.txt
Ekam_NotUseEDTA.fna                 Sfem_pilon_softmasked.fasta                         femo_busco.sh.o26221930   kohuki_busco.sh.o26238968   madaralength.txt
Elaeidobius_kamerunicus.masked.fna  Sfem_softmasked.fasta                               femo_busco.sh.pe26221930  kohuki_busco.sh.pe26238968
kosukesano@at139:~/tools/for_braker/nama_data$ 

これでかけたけど同じエラーが出た。

じゃあ全部大文字にしてやんよ!

~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1ディレクトリ内で以下のコードをかいた。

### oomoji.py

def convert_lowercase_to_uppercase(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if line.startswith('>'):
                # ヘッダー行はそのまま出力
                outfile.write(line)
            else:
                # 塩基配列の小文字を大文字に置換
                outfile.write(line.upper())

input_file = '/home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna'
output_file = '/home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna'

convert_lowercase_to_uppercase(input_file, output_file)

というわけでこれの出力でBRAKERをかける

kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cp GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cd ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta      GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  Sfem_softmasked.fasta    femo_busco.sh.pe26221930   kohuki_busco.sh.pe26238968
BUSCO_OUTPUT_FEMO_GENOME            GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna      busco_downloads          femo_busco.sh.po26221930   kohuki_busco.sh.po26238968
BUSCO_OUTPUT_KOHUKI_GENOME          Madara_RNAseq                                       femo_busco.sh            kohuki_busco.sh            kohuki_softmasked.fasta
Ekam_NotUseEDTA.fna                 Sfem_RNAseq                                         femo_busco.sh.e26221930  kohuki_busco.sh.e26238968  length.txt
Elaeidobius_kamerunicus.masked.fna  Sfem_pilon_softmasked.fasta                         femo_busco.sh.o26221930  kohuki_busco.sh.o26238968  madaralength.txt
kosukesano@at139:~/tools/for_braker/nama_data$ mv GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna Ekam_oomoji.fna
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta  Elaeidobius_kamerunicus.masked.fna                  Sfem_softmasked.fasta    femo_busco.sh.pe26221930   kohuki_busco.sh.pe26238968
BUSCO_OUTPUT_FEMO_GENOME        GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  busco_downloads          femo_busco.sh.po26221930   kohuki_busco.sh.po26238968
BUSCO_OUTPUT_KOHUKI_GENOME      Madara_RNAseq                                       femo_busco.sh            kohuki_busco.sh            kohuki_softmasked.fasta
Ekam_NotUseEDTA.fna             Sfem_RNAseq                                         femo_busco.sh.e26221930  kohuki_busco.sh.e26238968  length.txt
Ekam_oomoji.fna                 Sfem_pilon_softmasked.fasta                         femo_busco.sh.o26221930  kohuki_busco.sh.o26238968  madaralength.txt
kosukesano@at139:~/tools/for_braker/nama_data$ cd ../Ekam/
kosukesano@at139:~/tools/for_braker/Ekam$ ls
Ekam_braker.sh            Ekam_braker.sh.e27018371  Ekam_braker.sh.o27018371   Ekam_braker.sh.pe27018371  Ekam_braker.sh.po27018371
Ekam_braker.sh.e27004779  Ekam_braker.sh.o27004779  Ekam_braker.sh.pe27004779  Ekam_braker.sh.po27004779  braker
Ekam_braker.sh.e27018010  Ekam_braker.sh.o27018010  Ekam_braker.sh.pe27018010  Ekam_braker.sh.po27018010  gputest
kosukesano@at139:~/tools/for_braker/Ekam$ rm -r braker/
kosukesano@at139:~/tools/for_braker/Ekam$ nano Ekam_braker.sh
kosukesano@at139:~/tools/for_braker/Ekam$ qsub Ekam_braker.sh

これでも同じエラーが出てるんだけど ……。

デバッグのため、もう一度マダラのゲノムにbrakerをかける。これで同じエラーが出るならbrakerの環境が悪い。

~/tools/for_braker/241013_for_debag_madaraディレクトリを用意し、madara_braker.shをコピーしてqsubで実行。

Pstrのソフトマスク、マスキングの復元

Ekamに倣ってマスクを戻す。

### oomoji.py

def convert_lowercase_to_uppercase(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if line.startswith('>'):
                # ヘッダー行はそのまま出力
                outfile.write(line)
            else:
                # 塩基配列の小文字を大文字に置換
                outfile.write(line.upper())

input_file = "/home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna"
output_file = "/home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/Pstr_oomoji.fna"

convert_lowercase_to_uppercase(input_file, output_file)

大文字に戻して、これを自分でソフトマスクする。それ用のディレクトリを作成。

kosukesano@at139:~/tools/for_softmask$ mkdir Pstr_oomoji_softmask

データベース作成

(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_oomoji_softmask$ BuildDatabase -name Pstr_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/Pstr_oomoji.fna
Building database Pstr_BLAST_DATABASE_PREFIX:
  Reading /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/Pstr_oomoji.fna...
Number of sequences (bp) added to database: 84140 ( 2025024129 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_oomoji_softmask$ 

RepeatModeler

#$ -S /bin/bash
#$ -cwd
#$ -l intel
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Pstr_BLAST_DATABASE_PREFIX  -pa 6
date

これをqsubで投げた。コア数とかは調整してもいいかも。

1015

Pstrのソフトマスク

Pstrもすでにマスクされてるみたいなので必要あるかわからないけど一応やっとく。

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Pstr_BLAST_DATABASE_PREFIX  -pa 6
date

Pstrbraker実行

~/tools/for_braker/Pstrを作り、Pstr_braker.shを作成。

試しにローカルで実行してみる。

scorpionでのオジロソフトマスク

dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ source /home/dendezia/tool/pyenv_env/EDTA_profile
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ BuildDatabase -name Ojiro_BLAST_DATABASE ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
Building database Ojiro_BLAST_DATABASE:
  Reading ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa...
Number of sequences (bp) added to database: 328 ( 736756452 bp )
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ ls
Ojiro_BLAST_DATABASE.nhr  Ojiro_BLAST_DATABASE.njs  Ojiro_BLAST_DATABASE.nni  Ojiro_BLAST_DATABASE.nsq
Ojiro_BLAST_DATABASE.nin  Ojiro_BLAST_DATABASE.nnd  Ojiro_BLAST_DATABASE.nog  Ojiro_BLAST_DATABASE.translation
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ nano Ojiro_RepeatModeler.sh
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ nano Ojiro_RepeatModeler.sh
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ qsub Ojiro_RepeatModeler.sh 
2021.scorpion
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ qstat
Job id            Name             User              Time Use S Queue
----------------  ---------------- ----------------  -------- - -----
2021.scorpion     Ojiro_RepeatMod* dendezia                 0 R batch           
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ 

遺伝研でのオジロのソフトマスク

遺伝研でも入りそうなので、どっちが早いかわからないけどとりあえず入れてみる

オジロのゲノムデータを遺伝研に転送

:~/Downloads$ scp ~/Downloads/Release_241005-ojiro_hifiasm.tar.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Release_241005-ojiro_hifiasm.tar.gz                                                                                                                            100% 2342MB  90.8MB/s   00:25    
:~/Downloads$ 

遺伝研でそれを解凍

kosukesano@at139:~/tools/for_softmask/nama_data$ tar -xzf Release_241005-ojiro_hifiasm.tar.gz
kosukesano@at139:~/tools/for_softmask/nama_data$ ls
231117_madaragenome.fasta            Madara_ProcessRepeats.sh.e26141043   Madara_busco.sh             Madara_busco.sh.po26146490           Sfem_ProcessRepeats.sh.pe26141247  core.64019
231117_madaragenome.fasta.cat.gz     Madara_ProcessRepeats.sh.e26141224   Madara_busco.sh.e26144664   Pstr_data                            Sfem_ProcessRepeats.sh.po26141154  core.65135
231117_madaragenome.fasta.fasta.out  Madara_ProcessRepeats.sh.e26141230   Madara_busco.sh.e26144679   Pstr_ncbi_dataset.zip                Sfem_ProcessRepeats.sh.po26141247  core.65374
231117_madaragenome.fasta.masked     Madara_ProcessRepeats.sh.o26141043   Madara_busco.sh.e26146490   README.md                            Sfem_assembly.fasta                core.65380
231117_madaragenome.fasta.out        Madara_ProcessRepeats.sh.o26141224   Madara_busco.sh.o26144664   Release_241005-ojiro_hifiasm         Sfem_assembly.fasta.cat.gz         core.65483
231117_madaragenome.fasta.out.gff    Madara_ProcessRepeats.sh.o26141230   Madara_busco.sh.o26144679   Release_241005-ojiro_hifiasm.tar.gz  Sfem_assembly.fasta.masked         core.65491
231117_madaragenome.fasta.tbl        Madara_ProcessRepeats.sh.pe26141043  Madara_busco.sh.o26146490   Sfem_ProcessRepeats.sh               Sfem_assembly.fasta.out            md5sum.txt
BUSCO_OUTPUT_FEMO                    Madara_ProcessRepeats.sh.pe26141224  Madara_busco.sh.pe26144664  Sfem_ProcessRepeats.sh.e26141154     Sfem_assembly.fasta.out.gff
BUSCO_OUTPUT_MADARA                  Madara_ProcessRepeats.sh.pe26141230  Madara_busco.sh.pe26144679  Sfem_ProcessRepeats.sh.e26141247     Sfem_assembly.fasta.tbl
Ekam_data                            Madara_ProcessRepeats.sh.po26141043  Madara_busco.sh.pe26146490  Sfem_ProcessRepeats.sh.o26141154     Sfem_pilon
Ekam_ncbi_dataset.zip                Madara_ProcessRepeats.sh.po26141224  Madara_busco.sh.po26144664  Sfem_ProcessRepeats.sh.o26141247     busco_1897137032.log
Madara_ProcessRepeats.sh             Madara_ProcessRepeats.sh.po26141230  Madara_busco.sh.po26144679  Sfem_ProcessRepeats.sh.pe26141154    busco_downloads
kosukesano@at139:~/tools/for_softmask/nama_data$ ls Release_241005-ojiro_hifiasm
hifiasm.sh                   out.bp.hap2.p_ctg.gfa        out.bp.p_ctg.lowQ.bed   out.bp.p_utg.noseq.gfa  out.hap1.p_ctg.fa             out.p_ctg.fa
out.bp.hap1.p_ctg.gfa        out.bp.hap2.p_ctg.lowQ.bed   out.bp.p_ctg.noseq.gfa  out.bp.r_utg.gfa        out.hap1.p_ctg.fa.sort.fasta  slurm-3615.out
out.bp.hap1.p_ctg.lowQ.bed   out.bp.hap2.p_ctg.noseq.gfa  out.bp.p_utg.gfa        out.bp.r_utg.lowQ.bed   out.hap2.p_ctg.fa             stats.txt
out.bp.hap1.p_ctg.noseq.gfa  out.bp.p_ctg.gfa             out.bp.p_utg.lowQ.bed   out.bp.r_utg.noseq.gfa  out.hap2.p_ctg.fa.sort.fasta
kosukesano@at139:~/tools/for_softmask/nama_data$

データベース作成

kosukesano@at137:~$ source ~/tools/pyenv_env/EDTA_profile
(EDTA2) kosukesano@at137:~$ cd tools/for_softmask/Ojiro_softmask/
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ ls
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ BuildDatabase -name Ojiro_BLAST_DATABASE ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
Building database Ojiro_BLAST_DATABASE:
  Reading ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa...
Number of sequences (bp) added to database: 328 ( 736756452 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ ls
Ojiro_BLAST_DATABASE.nhr  Ojiro_BLAST_DATABASE.njs  Ojiro_BLAST_DATABASE.nni  Ojiro_BLAST_DATABASE.nsq
Ojiro_BLAST_DATABASE.nin  Ojiro_BLAST_DATABASE.nnd  Ojiro_BLAST_DATABASE.nog  Ojiro_BLAST_DATABASE.translation
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ 

ASTRAL系統樹を用いたbranchモデルのcodeml

  • b_free: #1を振った特定の枝でdN/dSが異なるというモデル
  • b_neut: 全ての枝でdN/dSが1であるというモデル
  • M0: 全ての枝でdN/dSが一定であるというモデル

この3つのモデルを以下の方法で比較する。

  • b_free vs b_neut: #1を振った特定の枝でdN/dSが1と異なるか。 BackgroundはdN/dS=1で、ForegroundはdN/dS≠1。
  • b_free vs M0: #1を振った特定の枝でdN/dSが他と異なるか。Backgroundは0\<dN/dS\<1で、ForegroundはdN/dS≠1。

このうち、branch-siteでは見れないdN/dS<1が見れると良いな

b_free

~/tools/for_paml/241009_ASTRAL_6sp/b_freeディレクトリを作成。その下で以下の2つのスクリプトを書いた。

(EDTA2) kosukesano@at137:~/tools/for_paml/241009_ASTRAL_6sp/b_free$ ls
b_free_ASTRAL_paml.sh  template.ctl
(EDTA2) kosukesano@at137:~/tools/for_paml/241009_ASTRAL_6sp/b_free$ 
### b_free_ASTRAL_paml.sh

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_free"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_b_free"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### template.ctl

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>

model = 2           * 記号の有無で異なる ω を推定
NSsites = 0           * サイト間では ω は一定
fix_omega = 0           * ω の値を配列から推定
omega = 1           * 推定は ω=1 からスタート


noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

これをqsubで投げた。

b_neut

### b_neut_ASTRAL_paml.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_neut"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_b_neut"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### template.ctl

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>

model = 2           * 記号の有無で異なる ω を推定
NSsites = 0           * サイト間では ω は一定
fix_omega = 1            * ω の値を固定
omega = 1           * 推定は ω=1 からスタート


noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

M0

### M0_ASTRAL_paml.sh

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/M0"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_m_zero"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### template.sh

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>

model = 0
NSsites = 0           * サイト間では ω は一定
fix_omega = 0           * ω の値を配列から推定
omega = 1           * 推定は ω=1 からスタート


noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

IQTREE系統樹を用いたbranchモデルのcodeml

ASTRALのと同じことをした。

尤度比検定

### free_vs_neut_lrp.py

import os
import re
from scipy.stats import chi2

def parse_lnL(file_path):
    try:
        with open(file_path, 'r') as f:
            for line in f:
                print(f"Processing line: {line.strip()}")  # デバッグ用
                match = re.search(r'lnL\(ntime:\s*\d+\s+np:\s*(\d+)\):\s+(-?\d+\.\d+)', line)
                if match:
                    np = int(match.group(1))
                    lnL = float(match.group(2))
                    return np, lnL
        print(f"{file_path} に 'lnL' 行が見つかりませんでした。形式を確認してください。")
        return None, None
    except Exception as e:
        print(f"{file_path} を開く際にエラーが発生しました: {e}")
        return None, None

def parse_w_ratios(file_path):
    try:
        with open(file_path, 'r') as f:
            content = f.read()
            match = re.search(r'Smad #(\d+\.\d+)', content)
            if match:
                smad_w_ratio = float(match.group(1))
                return smad_w_ratio
        print(f"{file_path} に 'Smad' の w ratio が見つかりませんでした。")
        return None
    except Exception as e:
        print(f"{file_path} の w ratio 抽出中にエラーが発生しました: {e}")
        return None

def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
    try:
        lr_stat = 2 * (alt_lnL - null_lnL)
        df = alt_np - null_np
        p_val = chi2.sf(lr_stat, df)
        return p_val
    except Exception as e:
        print(f"LRT計算中にエラーが発生しました: {e}")
        return None

def main():
    alt_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_free/result'
    null_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_neut/result'
    output_file = 'ASTRAL_free_vs_neut_lrt_results_with_w_ratios.txt'

    alt_dir = os.path.expanduser(alt_dir)
    null_dir = os.path.expanduser(null_dir)

    # 処理するOGファイルリストの取得
    og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_b_free' in f]

    with open(output_file, 'w') as out_f:
        out_f.write('OG_num\tp_val\tpositive_selection\tSmad_w_ratio\n')

        # 各OGファイルについてループ処理
        for idx, og_file in enumerate(og_files):
            og_num = og_file.split('_')[0]
            alt_file = os.path.join(alt_dir, og_file)
            null_file = os.path.join(null_dir, og_file.replace('_maffted_fixed_b_free', '_maffted_fixed_b_neut'))

            print(f"{idx+1}/{len(og_files)}: {og_num} の解析を開始します...")

            if os.path.exists(null_file):
                alt_np, alt_lnL = parse_lnL(alt_file)
                null_np, null_lnL = parse_lnL(null_file)
                smad_w_ratio = parse_w_ratios(alt_file)

                if alt_np is not None and null_np is not None:
                    p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
                    if p_val is not None:
                        reject_null = '+' if p_val < 0.05 else '-'
                        out_f.write(f'{og_num}\t{p_val}\t{reject_null}\t{smad_w_ratio}\n')
                        print(f"{og_num} の解析が完了しました。p値: {p_val}, 正の選択: {reject_null}, Smadのw ratio: {smad_w_ratio}")
                    else:
                        print(f"{og_num} のLRT計算に失敗しました。")
                else:
                    print(f"{og_num} のlnLデータが不完全です。")
            else:
                print(f"{og_num} の対応するnullモデルファイルが見つかりませんでした。")

if __name__ == "__main__":
    main()

これを実行した。

1017

遺伝研環境でのオジロのソフトマスク続き

RepeatModelerが終わったので、続いてRepeatMaskerをかける。

以下のスクリプトを書いてqsubで投げた。

### Ojiro_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

source /home/kosukesano/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib\
        /home/kosukesano/tools/for_softmask/Ojiro_softmask/RM_3181478.TueOct151949192024/consensi.fa.classified\
        /home/kosukesano/tools/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa

date

scorpion環境でのオジロのソフトマスク続き

遺伝研環境と同様、RepeatModelerが終わったので、RepeatMaskerをかける。

以下のスクリプトを書いてqsubで投げた。

### scorpionでのOjiro_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd

cd /home/dendezia/tool/for_softmask/Ojiro_softmask/

source /home/dendezia/tool/pyenv_env/EDTA_profile

RepeatMasker -pa 6 -lib\
        /home/dendezia/tool/for_softmask/Ojiro_softmask/RM_1996100.TueOct150214432024/consensi.fa.classified\
        -xsmall\
        /home/dendezia/tool/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa

date

遺伝研との変更点として、-xmallというオプションをつけてみた。Metalさんのサイトだと「反復配列を小文字にするsoft mask。 デフォルトでは N に置き換えるhard mask。」と説明されている。これを使えばわざわざProcessRepeatsのスクリプトを書かなくても良くなる?

これについて、遺伝研のがqwの間にオジロのRepeatMaskerが終わってた。 しかもソフトマスクまで終了している!

dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ ls ../nama_data/
Ekam_dataset  Release_241005-ojiro_hifiasm  Release_241005-ojiro_hifiasm.tar.gz
dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ ls ../nama_data/Release_241005-ojiro_hifiasm
hifiasm.sh                   out.bp.hap2.p_ctg.gfa        out.bp.p_ctg.lowQ.bed   out.bp.p_utg.noseq.gfa  out.hap1.p_ctg.fa             out.p_ctg.fa         out.p_ctg.fa.tbl
out.bp.hap1.p_ctg.gfa        out.bp.hap2.p_ctg.lowQ.bed   out.bp.p_ctg.noseq.gfa  out.bp.r_utg.gfa        out.hap1.p_ctg.fa.sort.fasta  out.p_ctg.fa.cat.gz  slurm-3615.out
out.bp.hap1.p_ctg.lowQ.bed   out.bp.hap2.p_ctg.noseq.gfa  out.bp.p_utg.gfa        out.bp.r_utg.lowQ.bed   out.hap2.p_ctg.fa             out.p_ctg.fa.masked  stats.txt
out.bp.hap1.p_ctg.noseq.gfa  out.bp.p_ctg.gfa             out.bp.p_utg.lowQ.bed   out.bp.r_utg.noseq.gfa  out.hap2.p_ctg.fa.sort.fasta  out.p_ctg.fa.out
dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$

RepeatMaskerの直接の出力であるout.p_ctg.fa.cat.gzの他に、ProcessRepeatsの出力であるout.p_ctg.fa.maskedもいる!

### out.p_ctg.fa.maskedの中身の一部

>ptg000001l
TAGCAGTATCGAGTATAATCATAATATCGTAGTTTTATTGCTAAAACTGT
CCTTTCAACTAATAGTTAGGTATAGATATTCACATATGCATTTTCATTTT
TAAATAAATCTTCGATACTCTGTAATCAATTTCCATTTTTGTTCTATCCC
AAATTATATAAAGTATATAATTTTCTATGTTTTTTTGGTGGAGTGTTCGC
AAAGGGCTGTGACTTGAAGGATGCGTCTTAATCTCGAGGAATATAATGAA
GCAAATGTATCTGCATTAATCTTCTTCTATCTAGTGAGTTGAAATATAAT
GTGGGGTATTATAACAATGACGCAGTAGTAAATAAAAATAAATCAAATCG
ACTTACGTCGATATAAAGTATACTAATTAAAAACATAAAGTCAATCTCGC
AAAAGCAAATATAAGTTAATACATATTAGATATAAATTTGTCCAGATATA
TTAAAATGGCTATTAGTCATTTCTTGACACGGGAtaattaataattaatt
tttcattaaattaaCATACTAAGAAAAACCAGACATCAGACCCAGTTGGT
TTTTCAACTGAAGTGAAACAGTAATCTTAAGCAAATATATCAATAATCTA
ATATGAATTCCTACAAAATTATCTGCTTGAACCTAGAACAAGCTATGCCT
GCGTATATAACTTTAACCAGTTAAGTGACTTCATGCATATATTACTATGA
TTTTAACACCTAATTAGCCTAATGGCTTCTGCTTATGTTCAAAAGATTAC
ATCTAAGTCGATTTTCTTCTCATCGTCATAAGAGGATTAAAATATTCAAA
TTAATAATATCCAGAATGATCAATAAATTAACAAACGAAATTTTAAATTG
CCGTTGATCTAATGTGGTAAATGGGTATTATGTAATATTTTTCGACAGGG
GTGGTATGATCGAGTAATTCGTCAACTAGAAACTACAGTATATATTGTAT
CTGAGCTGAACGAAGTtacagggatatccatataaaagtaatgaatccta
ctttttattcttaaataaacgttatatataaaagttttggctattttgaa
acatttatatatcttacaaccaaaataattgtgcaacgaataatgaagta
gtataaaacatcgattttcttgcttacttaaattggacggtatggttttt
attgcatatttgtattctaGGAAAAATTATGAACGTTACATGTATTATGG

ちゃんとソフトマスクされてるっぽい。今後はこれでいいな。

オジロゲノムのBRAKER

マスキング後のオジロのゲノムは241017_Ojiro_masked.faとした。

まずオジロのゲノムをローカルに移動

:~/bio$ scp dendezia@scorpion:/home/dendezia/tool/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa.masked ~/bio/241017_Ojiro_masked.fa
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
|       .+. .=o=+.|
|        o*.o.=.*+|
|       oo.*oo B.o|
|      ..o= +.* ..|
|    o .+S o * .  |
|   . o. .  E     |
|      ....o      |
|       oo+       |
|       o=        |
+----[SHA256]-----+
out.p_ctg.fa.masked                                                                                                                                            100%  717MB 111.5MB/s   00:06    
:~/bio$ ls
240903_ASTRAL.tre                                  DEG_Adult_vs_Larva_DESeq2.csv                      fastp.json
240903_ASTRAL_Optimal_tree.tre                     DEG_ovary_vs_body_DESeq2.csv                       femo_annotated
240910_ASTRAL.tre                                  GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz for_blast_test
240910_ASTRAL_Optimal_tree.tre                     IQTREE_7sp.tre                                     for_cafe
240912_ASTRAL.tre                                  SRR11742112                                        for_eggnoc
240912_ASTRAL_Optimal_tree.tre                     SRR11742112_1.fastq                                for_paml
240917_6sp_withOneZero_ASTRAL.tre                  SRR11742112_2.fastq                                functional_annotation
240917_7sp_ASTRAL.tre                              SRR9665770                                         madara_annotated
240917_CO1.tre                                     SRR9665770_1.fastq                                 madara_braker.zip
240919_7sp                                         SRR9665770_2.fastq                                 memo.txt
241017_Ojiro_masked.fa                             SRR9665770_report1.html                            new_rbh.py
7sp.tre                                            braker_t1_sequences.aa.zip                         qc_SRR9665770_1.fq
CAFE前準備.R                                       drawtree.R                                         qc_SRR9665770_2.fq
:~/bio$ 

続いてこれを遺伝研環境に移動

:~/bio$ scp ~/bio/241017_Ojiro_masked.fa kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
241017_Ojiro_masked.fa                                                                                                                                         100%  717MB 108.7MB/s   00:06    
:~/bio$ 

また、オジロはRNA-seqのデータもあるので、それらをすべて遺伝研に転送する。遺伝研の方でkosukesano/tools/for_braker/nama_data/Ojiro_RNAseqディレクトリを作り、そこにすべて転送した。

:/Volumes/Elements_1/240529_RNAseq/RawData$ scp /Volumes/Elements_1/240529_RNAseq/RawData/ojiro_*/*.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
ojiro_E1_1.fq.gz                                                                                                                                               100% 2176MB 107.7MB/s   00:20    
ojiro_E1_2.fq.gz                                                                                                                                               100% 2254MB 106.0MB/s   00:21    
ojiro_E2_1.fq.gz                                                                                                                                               100% 2379MB 110.1MB/s   00:21    
ojiro_E2_2.fq.gz                                                                                                                                               100% 2464MB 109.7MB/s   00:22    
ojiro_E3_1.fq.gz                                                                                                                                               100% 1858MB 107.4MB/s   00:17    
ojiro_E3_2.fq.gz                                                                                                                                               100% 1892MB  95.7MB/s   00:19    
ojiro_E4_1.fq.gz                                                                                                                                               100% 2184MB  88.2MB/s   00:24    
ojiro_E4_2.fq.gz                                                                                                                                               100% 2213MB  91.0MB/s   00:24    
ojiro_H1_1.fq.gz                                                                                                                                               100% 1510MB  87.7MB/s   00:17    
ojiro_H1_2.fq.gz                                                                                                                                               100% 1558MB 108.8MB/s   00:14    
ojiro_H2_1.fq.gz                                                                                                                                               100% 1860MB 110.6MB/s   00:16    
ojiro_H2_2.fq.gz                                                                                                                                               100% 1909MB 108.5MB/s   00:17    
ojiro_H3_1.fq.gz                                                                                                                                               100% 1620MB 107.8MB/s   00:15    
ojiro_H3_2.fq.gz                                                                                                                                               100% 1669MB 109.6MB/s   00:15    
ojiro_H4_1.fq.gz                                                                                                                                               100% 1846MB 108.7MB/s   00:16    
ojiro_H4_2.fq.gz                                                                                                                                               100% 1889MB 108.6MB/s   00:17    
ojiro_L1_1.fq.gz                                                                                                                                               100% 2072MB 110.0MB/s   00:18    
ojiro_L1_2.fq.gz                                                                                                                                               100% 2149MB 109.0MB/s   00:19    
ojiro_L2_1.fq.gz                                                                                                                                               100% 2166MB 108.3MB/s   00:19    
ojiro_L2_2.fq.gz                                                                                                                                               100% 2200MB  96.0MB/s   00:22    
ojiro_L3_1.fq.gz                                                                                                                                               100% 1838MB  90.3MB/s   00:20    
ojiro_L3_2.fq.gz                                                                                                                                               100% 1894MB  95.4MB/s   00:19    
ojiro_L4_1.fq.gz                                                                                                                                               100% 1868MB 107.0MB/s   00:17    
ojiro_L4_2.fq.gz                                                                                                                                               100% 1939MB  95.4MB/s   00:20    
ojiro_O1_1.fq.gz                                                                                                                                               100% 1603MB  93.4MB/s   00:17    
ojiro_O1_2.fq.gz                                                                                                                                               100% 1682MB 110.1MB/s   00:15    
ojiro_O2_1.fq.gz                                                                                                                                               100% 1707MB 108.7MB/s   00:15    
ojiro_O2_2.fq.gz                                                                                                                                               100% 1783MB 109.0MB/s   00:16    
ojiro_O3_1.fq.gz                                                                                                                                               100% 1499MB 103.8MB/s   00:14    
ojiro_O3_2.fq.gz                                                                                                                                               100% 1546MB 105.3MB/s   00:14    
ojiro_O4_1.fq.gz                                                                                                                                               100% 1865MB 109.5MB/s   00:17    
ojiro_O4_2.fq.gz                                                                                                                                               100% 1921MB 106.7MB/s   00:18    
ojiro_T1_1.fq.gz                                                                                                                                               100% 1891MB 109.8MB/s   00:17    
ojiro_T1_2.fq.gz                                                                                                                                               100% 1986MB 108.9MB/s   00:18    
ojiro_T2_1.fq.gz                                                                                                                                               100% 1550MB 106.6MB/s   00:14    
ojiro_T2_2.fq.gz                                                                                                                                               100% 1618MB 107.8MB/s   00:15    
ojiro_T3_1.fq.gz                                                                                                                                               100% 1724MB 108.1MB/s   00:15    
ojiro_T3_2.fq.gz                                                                                                                                               100% 1796MB 107.5MB/s   00:16    
ojiro_T4_1.fq.gz                                                                                                                                               100% 1505MB 105.0MB/s   00:14    
ojiro_T4_2.fq.gz                                                                                                                                               100% 1563MB 106.8MB/s   00:14    
ojiro_male_1.fq.gz                                                                                                                                             100% 1408MB 108.1MB/s   00:13    
ojiro_male_2.fq.gz                                                                                                                                             100% 1470MB 109.4MB/s   00:13    
:/Volumes/Elements_1/240529_RNAseq/RawData$ cd ../first_raw_read/
:/Volumes/Elements_1/240529_RNAseq/first_raw_read$ ls
femo-female_1.fastq.gz  femo-larva_1.fastq.gz   femo-male_1.fastq.gz    ojiro-female_1.fastq.gz ojiro-larva_1.fastq.gz  ojiro-male_1.fastq.gz   ojiro_femo.md5sum
femo-female_2.fastq.gz  femo-larva_2.fastq.gz   femo-male_2.fastq.gz    ojiro-female_2.fastq.gz ojiro-larva_2.fastq.gz  ojiro-male_2.fastq.gz
:/Volumes/Elements_1/240529_RNAseq/first_raw_read$ scp /Volumes/Elements_1/240529_RNAseq/first_raw_read/ojiro* kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
ojiro-female_1.fastq.gz                                                                                                                                        100% 1049MB  89.2MB/s   00:11    
ojiro-female_2.fastq.gz                                                                                                                                        100% 1108MB 109.2MB/s   00:10    
ojiro-larva_1.fastq.gz                                                                                                                                         100% 1310MB 109.9MB/s   00:11    
ojiro-larva_2.fastq.gz                                                                                                                                         100% 1350MB  86.9MB/s   00:15    
ojiro-male_1.fastq.gz                                                                                                                                          100% 1022MB 105.2MB/s   00:09    
ojiro-male_2.fastq.gz                                                                                                                                          100% 1059MB 100.6MB/s   00:10    
ojiro_femo.md5sum                                                                                                                                              100%  810    81.2KB/s   00:00    
:/Volumes/Elements_1/240529_RNAseq/first_raw_read$ 

遺伝研にて解凍。これめちゃくちゃ時間かかる……。

kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ ls
ojiro-female_1.fastq.gz  ojiro-male_2.fastq.gz  ojiro_E3_1.fq.gz  ojiro_H1_2.fq.gz  ojiro_H4_1.fq.gz  ojiro_L2_2.fq.gz  ojiro_O1_1.fq.gz  ojiro_O3_2.fq.gz  ojiro_T2_1.fq.gz  ojiro_T4_2.fq.gz
ojiro-female_2.fastq.gz  ojiro_E1_1.fq.gz       ojiro_E3_2.fq.gz  ojiro_H2_1.fq.gz  ojiro_H4_2.fq.gz  ojiro_L3_1.fq.gz  ojiro_O1_2.fq.gz  ojiro_O4_1.fq.gz  ojiro_T2_2.fq.gz  ojiro_femo.md5sum
ojiro-larva_1.fastq.gz   ojiro_E1_2.fq.gz       ojiro_E4_1.fq.gz  ojiro_H2_2.fq.gz  ojiro_L1_1.fq.gz  ojiro_L3_2.fq.gz  ojiro_O2_1.fq.gz  ojiro_O4_2.fq.gz  ojiro_T3_1.fq.gz  ojiro_male_1.fq.gz
ojiro-larva_2.fastq.gz   ojiro_E2_1.fq.gz       ojiro_E4_2.fq.gz  ojiro_H3_1.fq.gz  ojiro_L1_2.fq.gz  ojiro_L4_1.fq.gz  ojiro_O2_2.fq.gz  ojiro_T1_1.fq.gz  ojiro_T3_2.fq.gz  ojiro_male_2.fq.gz
ojiro-male_1.fastq.gz    ojiro_E2_2.fq.gz       ojiro_H1_1.fq.gz  ojiro_H3_2.fq.gz  ojiro_L2_1.fq.gz  ojiro_L4_2.fq.gz  ojiro_O3_1.fq.gz  ojiro_T1_2.fq.gz  ojiro_T4_1.fq.gz
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ rm ojiro_femo.md5sum 
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ ls
ojiro-female_1.fastq.gz  ojiro-male_2.fastq.gz  ojiro_E3_1.fq.gz  ojiro_H1_2.fq.gz  ojiro_H4_1.fq.gz  ojiro_L2_2.fq.gz  ojiro_O1_1.fq.gz  ojiro_O3_2.fq.gz  ojiro_T2_1.fq.gz  ojiro_T4_2.fq.gz
ojiro-female_2.fastq.gz  ojiro_E1_1.fq.gz       ojiro_E3_2.fq.gz  ojiro_H2_1.fq.gz  ojiro_H4_2.fq.gz  ojiro_L3_1.fq.gz  ojiro_O1_2.fq.gz  ojiro_O4_1.fq.gz  ojiro_T2_2.fq.gz  ojiro_male_1.fq.gz
ojiro-larva_1.fastq.gz   ojiro_E1_2.fq.gz       ojiro_E4_1.fq.gz  ojiro_H2_2.fq.gz  ojiro_L1_1.fq.gz  ojiro_L3_2.fq.gz  ojiro_O2_1.fq.gz  ojiro_O4_2.fq.gz  ojiro_T3_1.fq.gz  ojiro_male_2.fq.gz
ojiro-larva_2.fastq.gz   ojiro_E2_1.fq.gz       ojiro_E4_2.fq.gz  ojiro_H3_1.fq.gz  ojiro_L1_2.fq.gz  ojiro_L4_1.fq.gz  ojiro_O2_2.fq.gz  ojiro_T1_1.fq.gz  ojiro_T3_2.fq.gz
ojiro-male_1.fastq.gz    ojiro_E2_2.fq.gz       ojiro_H1_1.fq.gz  ojiro_H3_2.fq.gz  ojiro_L2_1.fq.gz  ojiro_L4_2.fq.gz  ojiro_O3_1.fq.gz  ojiro_T1_2.fq.gz  ojiro_T4_1.fq.gz
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ gunzip -rf ../Ojiro_RNAseq/

遺伝研にて、オジロ用のBRAKERディレクトリを作った。

kosukesano@at138:~/tools/for_braker$ mkdir Ojiro
kosukesano@at138:~/tools/for_braker$ cd Ojiro
kosukesano@at138:~/tools/for_braker/Ojiro$ nano Ojiro_braker.sh

1018

オジロの BRAKER続き

~/tools/for_braker/Ojiroで以下のスクリプトを書き、qsubで投げた。

### Ojiro_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241017_Ojiro_masked.fa\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=ojiro-female_1,ojiro-female_2,ojiro-male_1,ojiro-male_2,ojiro-larva_1,ojiro-larva_2,\
        ojiro_E1_1,ojiro_E1_2,ojiro_E2_1,ojiro_E2_2,ojiro_E3_1,ojiro_E3_2,ojiro_E4_1,ojiro_E4_2,\
        ojiro_H1_1,ojiro_H1_2,ojiro_H2_1,ojiro_H2_2,ojiro_H3_1,ojiro_H3_2,ojiro_H4_1,ojiro_H4_2,\
        ojiro_L1_1,ojiro_L1_2,ojiro_L2_1,ojiro_L2_2,ojiro_L3_1,ojiro_L3_2,ojiro_L4_1,ojiro_L4_2,\
        ojiro_O1_1,ojiro_O1_2,ojiro_O2_1,ojiro_O2_2,ojiro_O3_1,ojiro_O3_2,ojiro_O4_1,ojiro_O4_2,\
        ojiro_T1_1,ojiro_T1_2,ojiro_T2_1,ojiro_T2_2,ojiro_T3_1,ojiro_T3_2,ojiro_T4_1,ojiro_T4_2\
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq\
        --threads=16\
        --species=Ojiro_241017\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date
~

また、intelノードが混んでそうだったので、gpuノード用のディレクトリ~/tools/for_braker/Ojiro/gputestも作成、以下のスクリプトを書いた。

### GPU_Ojiro_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16

echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241017_Ojiro_masked.fa\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=ojiro-female_1,ojiro-female_2,ojiro-male_1,ojiro-male_2,ojiro-larva_1,ojiro-larva_2,\
        ojiro_E1_1,ojiro_E1_2,ojiro_E2_1,ojiro_E2_2,ojiro_E3_1,ojiro_E3_2,ojiro_E4_1,ojiro_E4_2,\
        ojiro_H1_1,ojiro_H1_2,ojiro_H2_1,ojiro_H2_2,ojiro_H3_1,ojiro_H3_2,ojiro_H4_1,ojiro_H4_2,\
        ojiro_L1_1,ojiro_L1_2,ojiro_L2_1,ojiro_L2_2,ojiro_L3_1,ojiro_L3_2,ojiro_L4_1,ojiro_L4_2,\
        ojiro_O1_1,ojiro_O1_2,ojiro_O2_1,ojiro_O2_2,ojiro_O3_1,ojiro_O3_2,ojiro_O4_1,ojiro_O4_2,\
        ojiro_T1_1,ojiro_T1_2,ojiro_T2_1,ojiro_T2_2,ojiro_T3_1,ojiro_T3_2,ojiro_T4_1,ojiro_T4_2\
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq\
        --threads=16\
        --species=Ojiro_241017_GPU\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

PstrBRAKER

ジョブは終わったけど結果のファイルが出力されてないぞ!?

### Pstr_braker.sh.o27030241の中身

start at
Tue Oct 15 13:32:47 JST 2024
# Tue Oct 15 13:32:54 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_braker/Pstr/braker/braker.log
#*********
# WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1413
file /lustre7/home/kosukesano/tools/for_braker/Pstr/braker/genome.fa contains a highly fragmented assembly (84140 scaffolds). This may lead to problems when running AUGUSTUS via braker in parallelized mode. You set --threads=16. You should run braker.pl in linear mode on such genomes, though (--threads=1).
#*********

なんかコフキの時に見たエラーな気がする。とりあえずスレッド数を落としてもう一度行った。

1019

オジロのBRAKER終了!

gpuの方で行ったオジロのBRAKERが終了してた。

kosukesano@at138:~/tools/for_braker/Ojiro$ cd gputest/braker/
kosukesano@at138:~/tools/for_braker/Ojiro/gputest/braker$ ls
Augustus  GeneMark-ETP  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  species  what-to-cite.txt

できたファイルはこんな感じ

kosukesano@at139:~/tools/for_braker/Ojiro/gputest/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,019  9,595,507        4      599   19,746
kosukesano@at139:~/tools/for_braker/Ojiro/gputest/braker$ 

オジロを含めた7種でのOrthofinder

まずオジロのヘッダーを修正する。修正用にbraker.aaOjiro.fastaとしてコピー。

kosukesano@at138:~/tools/for_braker/Ojiro/gputest/braker$ cp braker.aa Ojiro.fasta
kosukesano@at138:~/tools/for_braker/Ojiro/gputest/braker$ ls
Augustus  GeneMark-ETP  Ojiro.fasta  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  species  what-to-cite.txt

続いて同じディレクトリでヘッダー書き換え用スクリプトedit.pyを作成。

### edit.pyの中身

### edit.pyの中身

import os
from Bio import SeqIO

# 入力ディレクトリと出力ディレクトリのパス
input_dir = '../braker'
output_dir = '../braker/RemakeHedder_Ojiro'

# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 入力ディレクトリ内のすべての .fasta ファイルを処理
for input_file in os.listdir(input_dir):
    if input_file.endswith('.fasta'):
        input_path = os.path.join(input_dir, input_file)
        output_path = os.path.join(output_dir, input_file)

        # 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
        with open(output_path, 'w') as outfile:
            for record in SeqIO.parse(input_path, 'fasta'):
                header = record.description
                seq = str(record.seq)

                # ヘッダーが「g」で始まる場合
                if header.startswith("g"):
                    # 新しいヘッダーは「>Ojir」 + 「元のヘッダーの番号」
                    number = header.split()[0]  # ヘッダーの最初の番号部分を取得
                    new_header = f">Ojir_{number}"

                # ヘッダーが「]」で終わる場合
                elif header.endswith("]"):
                    # ヘッダーの最後の「[]」内の英字を抽出
                    within_brackets = header.split('[')[-1].split(']')[0]
                    first_letter = within_brackets[0]  # 最初の1文字
                    space_after = within_brackets.split()[-1][:3]  # スペース後の3文字
                    
                    # 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
                    first_part = header.split()[0][1:]
                    new_header = f">{first_letter}{space_after}_{first_part}"

                else:
                    new_header = f">{header.split()[0]}"

                # 新しいヘッダーと配列を出力ファイルに書き込む
                outfile.write(f"{new_header}\n{seq}\n")

        print(f"{output_path} に保存しました。")

これを実行。

続いて~/tools/for_orthofinderディレクトリにて241019_6plusOjiroディレクトリを作成。ここにオジロや他6種のゲノムを持ってきた。

kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ cp ~/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta ../241019_6plusOjiro/
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ ls
Ojiro.fasta
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ cp ../RemakeHedder_6sp/*.fasta ../241019_6plusOjiro/
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Ojiro.fasta  Smad.fasta  Sory.fasta  Tcas.fasta
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$

そして、~/tools/for_orthofinderディレクトリにてOrthofinder_241019.shを作成、qsubで投げた。

### Orthofinder_241019.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l intel
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
        -f /home/kosukesano/tools/for_orthofinder/241019_6plusOjiro\
        -t 16

date

ノードをintelに、スロットを16に設定。10分くらいで終わった。

出力はこう

kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19$ ls
Citation.txt                     Gene_Trees            Orthogroups                            Phylogenetically_Misplaced_Genes  Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics  Log.txt               Orthologues                            Putative_Xenologs                 Species_Tree
Gene_Duplication_Events          Orthogroup_Sequences  Phylogenetic_Hierarchical_Orthogroups  Resolved_Gene_Trees               WorkingDirectory
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19$ ls Orthogroups/
Orthogroups.GeneCount.tsv  Orthogroups.tsv  Orthogroups.txt  Orthogroups_SingleCopyOrthologues.txt  Orthogroups_UnassignedGenes.tsv

オジロゲノムのBUSCO

BRAKER終わったらやっとかなきゃね。

~/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_OjiroディレクトリでOjiro_busco.shを作成、qsubで投げた。

### Ojiro_busco.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date


singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m protein\
        -i /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta\
        -o /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/BUSCO_OUTPUT_Ojiro\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

date

オジロゲノムを含めたPAML前準備、SCOの抽出

~/tools/for_paml/data/plusOjiroを作成し、ExOG.pyを書いた。

# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Orthogroups/Orthogroups_SingleCopyOrthologues.txt'
output_file_path = '/home/kosukesano/tools/for_paml/data/plusOjiro/extracted_orthogroups.txt'

# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
    for line in single_copy_file:
        single_copy_orthologues.add(line.strip())

# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
    for line in orthogroups_file:
        # 行の最初の部分を取り出してIDをチェック
        og_id = line.split(':')[0].strip()
        if og_id in single_copy_orthologues:
            output_file.write(line)

これを実行するとextracted_orthogroups.txtができる

### extracted_orthogroups.txt

OG0008141: Agra_P_050292700.1 Cass_AG9761214.1 Dpon_P_019755574.2 Ojir_g1996.t1 Smad_g6358.t1 Sory_P_030761209.1 Tcas_P_008195282.1
OG0008142: Agra_P_050292731.1 Cass_AH1135743.1 Dpon_P_048519923.1 Ojir_g7978.t1 Smad_g2098.t1 Sory_P_030765758.1 Tcas_P_008196870.1
OG0008143: Agra_P_050292732.1 Cass_AG9767756.1 Dpon_P_019773495.1 Ojir_g6189.t1 Smad_g5269.t1 Sory_P_030765067.1 Tcas_P_015836383.1
OG0008144: Agra_P_050292739.1 Cass_AG9768060.1 Dpon_P_019769194.2 Ojir_g6137.t1 Smad_g11904.t1 Sory_P_030755089.1 Tcas_P_969265.1
OG0008145: Agra_P_050292743.1 Cass_AG9767942.1 Dpon_P_019767966.1 Ojir_g4737.t1 Smad_g4980.t1 Sory_P_030750408.1 Tcas_P_971491.1

続いて、ヘッダーがタンパク質のものと揃っているアミノ酸CDSのファイルを揃える。~/tools/for_paml/data/plusOjiro_nama_data/Amino_seqディレクトリを作成、かつて作った分のファイルをコピー。

kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data$ mkdir Amino_seq
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data$ cd Amino_seq/
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ls
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ cp ../../241009_RemakeHedder_6sp_afterchange/.fasta ../Amino_seq/
cp: cannot stat '../../241009_RemakeHedder_6sp_afterchange/.fasta': No such file or directory
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ cp ../../241009_RemakeHedder_6sp_afterchange/*.fasta ../Amino_seq/
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ls
Agra_changehedder.fasta  Cass_changehedder.fasta  Dpon_changehedder.fasta  Smad_changehedder.fasta  Sory_changehedder.fasta  Tcas_changehedder.fasta

また、マダラもbraker.cordingseqedit.pyで処置してコピー。

kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ls
Agra_changehedder.fasta  Cass_changehedder.fasta  Dpon_changehedder.fasta  Ojir_changehedder.fasta  Smad_changehedder.fasta  Sory_changehedder.fasta  Tcas_changehedder.fasta
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ 

~/tools/for_paml/data/plusOjiroディレクトリに集約させた方が良いのでは?mvで移動させた。

kosukesano@at138:~/tools/for_paml/data/plusOjiro$ mv ../plusOjiro_nama_data/ ../plusOjiro
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ ls
ExOG.py  extracted_orthogroups.txt  plusOjiro_nama_data

この後、new_makefna.pyを作成、実行した。

# 必要なモジュールをインポート
import os

# ファイルパスの設定
orthogroups_file = "extracted_orthogroups.txt"
input_dir = "/home/kosukesano/tools/for_paml/data/plusOjiro/plusOjiro_nama_data/Amino_seq"
output_dir = "/home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/"

# ディレクトリが存在しない場合、作成
os.makedirs(output_dir, exist_ok=True)

# OG番号と遺伝子IDをextracted_orthogroups.txtから取得
with open(orthogroups_file, "r") as ortho_f:
    for line in ortho_f:
        if line.strip():  # 空行を無視
            # 行をOG番号と遺伝子IDに分割
            og_number, gene_ids_str = line.split(":")
            og_number = og_number.strip()
            gene_ids = gene_ids_str.strip().split()

            # 遺伝子IDを種ごとに分割
            genes = {
                "Agra": gene_ids[0],
                "Cass": gene_ids[1],
                "Dpon": gene_ids[2],
                "Ojir": gene_ids[3],
                "Smad": gene_ids[4],
                "Sory": gene_ids[5],
                "Tcas": gene_ids[6]
            }

            # 出力ファイルのパスを設定
            output_file = os.path.join(output_dir, f"{og_number}.fna")

            # 出力ファイルを開く
            with open(output_file, "w") as out_f:
                # 各種ごとに遺伝子IDを取得し、対応するファイルからシーケンスを検索
                for species, gene_id in genes.items():
                    fasta_file = os.path.join(input_dir, f"{species}_changehedder.fasta")

                    with open(fasta_file, "r") as fasta_f:
                        write_flag = False
                        for line in fasta_f:
                            if line.startswith(f">{gene_id}"):
                                # ヘッダー行を見つけたら、出力ファイルに書き込みを開始
                                out_f.write(line)
                                print(line.strip())  # 標準出力にヘッダーを表示
                                write_flag = True
                            elif line.startswith(">") and write_flag:
                                # 次のヘッダー行が見つかったら、現在の遺伝子の書き込みを終了
                                write_flag = False
                            elif write_flag:
                                # シーケンス部分を書き込む
                                out_f.write(line)
                                print(line.strip())  # 標準出力にシーケンスを表示

            print(f"{og_number}.fna ファイルが {output_dir} に保存されました。")

出力

>Tcas_P_008199734.2
ATGGAAATCGAGAACAAATTAGACGAGGACTTCGTCTTCTACCTCGGCTTCGTCGGTACTTACTTCAAACATATCCGCGATAAAGACATTCGTCACCACTGCGAACAATGGTTGCTAAAACTCTGCGGGGAGCCTTGCCAAGGAATTGAAAAGAAACGAGGCCGCAATATCTACCTCTCACAACTCATTCTATGCATGCAAACTGGAATTTTGGGCAATGAATTTAAAGTTCCGGTCAACGAAGTCGATGTAGCGAATGCGACCCAGGTGTTCCAGCTGCAGCCCGAAGGAGAAGCATTTCAGACCCCAGGATGGTTGGAGGATAACGATGCTGATGTAGGTACTGCTGCCAGGAATGCAAAAACTGGGCGGACTTACGTGGCTACGCGTACATTGCCGGGAGGACAAGGGGCTTTTGCGTACGTTGCCGTCTCCTTAGACGAGGAAGAACCCAAGTGGTTGGGAGGCGGGGAAGGTGTTTTTGACCGGCATATGGAACAGAAGTTCAGGGAGGAGGTGCCCGATTATGAAATGGAAAAGATTCTAGCAAGGAGGAAAGATCCTAAAGAGCGGGAGAAGGTTATCACCTTCTATAAAGTCCTGTTGACAAATATTGAAGATGAGTTGGACGAGAAGATACATGCAGGTGAAAATGACACTGTTAATGGTCTTTTGGAGCAACTGGAACAAGATATGAGGGATCGTGGCCAGTTTGAACCATTCGCACACTTGAATGCTAAAGATTTAAGAAACGAACTTCTTCTAGTGCTACACGATCGCATTCAGCTAAGGATTAATAAAGTGATGAAACGTGAGGAACTTCTCGATGAAATTGAGAAAGGCATTCTTGCGAAATCATTCTTTGAAACCTCTGTAACGCCAGAAGACAAGTTCTTGTTACCCCCGGCAATGTGGGAGCAAGCTATCAATAAAATCCCCAACAAAAAACTGCTTGAGAAATTAAGGGATAACTATCCGATGATTCTAATAGAGAAATTCTTGAAGCTACTTTCTGATTATAAAGAAGAAATAGCTGTGAGAATGCACCGTCGACATGAAAACATCGCCGCGCAGATGAAGCGGGAGTTAAGACGTGAAGACGAGAAGGGGAAGAAGCTTGTCGAAGGTGCCCAAATCGCCTGCGACCACGCTACTGAGATTCTCAAGGCTGTCAAAGAAGCCTATACCACTAAGGCCGAAGTCGAGAGGAGAAATGCAGAGAAGGTTGCCATTCCAAAATCGGAGCATTCTGAGCTTTATGATCAAATGAGGGCCGCTTTGCTTGACACTCAGAAGTCTGTCGAGGATGAAGCGGCCAGAGGAAAAGTGTTGGCTGCTCAAATTGGAGAAATCAACGAACAGACTGAAATGTGTTTGAAAGTAACGGAAGAAAATGTCAGGAAGATTGAAGAGAAGAATATGGAAATAATGAAAAATATCAAGAGACTGAATGCCGCAATTGACAATCAGCAGAAGAGGATCGAAATGGTGCAGAAGGTGGGGGCGAAGAAGGGAAATCAGCTTGAATTTTTCTTTTAA
OG0009886.fna ファイルが /home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/ に保存されました。
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ ls
CDS_SCO  ExOG.py  extracted_orthogroups.txt  new_makefna.py  plusOjiro_nama_data
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ ls CDS_SCO/
OG0008141.fna  OG0008289.fna  OG0008453.fna  OG0008599.fna  OG0008749.fna  OG0008891.fna  OG0009031.fna  OG0009169.fna  OG0009312.fna  OG0009459.fna  OG0009598.fna  OG0009748.fna
OG0008142.fna  OG0008290.fna  OG0008454.fna  OG0008600.fna  OG0008750.fna  OG0008892.fna  OG0009032.fna  OG0009171.fna  OG0009314.fna  OG0009460.fna  OG0009599.fna  OG0009749.fna
OG0008143.fna  OG0008292.fna  OG0008457.fna  OG0008601.fna  OG0008752.fna  OG0008893.fna  OG0009033.fna  OG0009172.fna  OG0009315.fna  OG0009462.fna  OG0009601.fna  OG0009751.fna

これにMAFFTをかける。~/tools/for_paml/data/plusOjiroディレクトリでmafft.shを作成、qsubで実行。

#$ -S /bin/bash

source ~/tools/pyenv_env/ManualPhilo_profile

# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/"
output_dir="/home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fna)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_maffted.fna"

  # MAFFTを実行
  mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"

  echo "Aligned file created: $output_file"
done

オジロゲノムを含めたCAFE前準備

Orthogroups.GeneCount.tsvSpeciesTree_rooted.txtをローカルにコピー。

:~/bio/for_cafe/241019_orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Orthogroups/Orthogroups.GeneCount.tsv /Users/kosukesano/bio/for_cafe/241019_orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.GeneCount.tsv                                                                                                                                      100%  387KB   4.1MB/s   00:00    
:~/bio/for_cafe/241019_orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Species_Tree/SpeciesTree_rooted.txt /Users/kosukesano/bio/for_cafe/241019_orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
SpeciesTree_rooted.txt                                                                                                                                         100%  189    10.2KB/s   00:00    
:~/bio/for_cafe/241019_orthofinder_data$ ls
Orthogroups.GeneCount.tsv SpeciesTree_rooted.txt
:~/bio/for_cafe/241019_orthofinder_data$ 

~/bio/for_cafe/にて241019_cafe前処理.Rを作成、実行した。

Orthologs_raw <- read_tsv(paste("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.GeneCount.tsv", sep = "/"))

##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
  select(!c(Orthogroup, Total)) %>%
  t()

##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>% 
  apply(2, max) %>%
  as.data.frame() %>%
  rename(max_real = ".")
saisyou <- Enzan %>% 
  apply(2, min) %>%
  as.data.frame() %>%
  rename(min_real = ".")

##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
  bind_cols(saidai, saisyou)

##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>% 
  mutate(sa = max_real - min_real) %>%
  filter(max_real != min_real) %>%
  filter(sa < 50)

##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>% 
  mutate(Description = Orthogroup, ID = Orthogroup) %>%
  relocate(Description, ID) %>%
  select(!c(Orthogroup, max_real, min_real, sa))

#Orthologs_3 %>% 
#  write_tsv(paste("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
##Did you finish creating ultrametric tree with makeultrametric.R?

############

tree = read.tree("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/SpeciesTree_rooted.txt")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
  tree,
  100000,
  age.min = 152.3,  # 推定分岐年代の最小値(MYA)
  age.max = 236.2,  # 推定分岐年代の最大値(MYA)
  node = mrca,   # getMRCAで指定したノード
  S = 1,
  tol = 1e-20,
  CV = FALSE,
  eval.max = 500,
  iter.max = 500
)
is.ultrametric(tree2)  # ultrametricかどうか確認
[1] TRUE
#write.tree(tree2, file = "/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/tree_ultrametric.nwk")  # ultrametric系統樹の保存

こうしてできたOrthogroups.GeneCount2.tsvtree_ultrametric.nwkをDDBJの~/tools/for_cafe/241019_plusOjiroに転送した。

:~/bio/for_cafe/241019_orthofinder_data$ scp /Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.GeneCount2.tsv kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241019_plusOjiro
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.GeneCount2.tsv                                                                                                                                     100%  438KB   7.0MB/s   00:00    
:~/bio/for_cafe/241019_orthofinder_data$ scp /Users/kosukesano/bio/for_cafe/241019_orthofinder_data/tree_ultrametric.nwk kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241019_plusOjiro
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
tree_ultrametric.nwk                                                                                                                                           100%  217    15.9KB/s   00:00    
:~/bio/for_cafe/241019_orthofinder_data$ 

これを用いてCAFE5を実行

kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk

Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk 

Filtering families not present at the root from: 13167 to 8317

No root family size distribution specified, using uniform distribution

Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1

Starting Search for Initial Parameter Values
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0020777102976481
Score (-lnL): 125409.06131684
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0024734646400572
Score (-lnL): 125410.58799608
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0023250567616538
Score (-lnL):  125327.7598648
Lambda: 0.0022261181760515
Score (-lnL):   125323.898933
Lambda: 0.0023250567616538
Score (-lnL):  125327.7598648
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022879547920529
Score (-lnL): 125321.11212858
Lambda: 0.0022632201456524
Score (-lnL):  125320.0882598
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0022570364840522
Score (-lnL): 125320.27259376
Lambda: 0.0022694038072525
Score (-lnL): 125320.08098691
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0022724956380526
Score (-lnL):   125320.143545
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022632201456524
Score (-lnL):  125320.0882598
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022678578918525
Score (-lnL):  125320.0662948
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022655390187524
Score (-lnL):  125320.0648584
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022655390187524
Score (-lnL):  125320.0648584
Lambda: 0.0022659254976024
Score (-lnL): 125320.06338383
Lambda: 0.0022666984553025
Score (-lnL):  125320.0625129
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022668916947275
Score (-lnL): 125320.06272849
Lambda: 0.0022665052158774
Score (-lnL): 125320.06247031
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022664085961649
Score (-lnL): 125320.06251424
Lambda: 0.00226660183559
Score (-lnL): 125320.06246987

Completed 17 iterations
Time: 0H 0M 3S
Best match is: 0.00226660183559
Final -lnL: 125320.06246987

38 values were attempted (0% rejected)

Inferring processes for Base model
Score (-lnL): 125320.06246987
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!

Starting reconstruction processes for Base model
Done!

kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ 

結果はこんな感じ

kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk

Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk 

Filtering families not present at the root from: 13167 to 8317

No root family size distribution specified, using uniform distribution

Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1

Starting Search for Initial Parameter Values
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0020777102976481
Score (-lnL): 125409.06131684
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0024734646400572
Score (-lnL): 125410.58799608
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0023250567616538
Score (-lnL):  125327.7598648
Lambda: 0.0022261181760515
Score (-lnL):   125323.898933
Lambda: 0.0023250567616538
Score (-lnL):  125327.7598648
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022879547920529
Score (-lnL): 125321.11212858
Lambda: 0.0022632201456524
Score (-lnL):  125320.0882598
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0022570364840522
Score (-lnL): 125320.27259376
Lambda: 0.0022694038072525
Score (-lnL): 125320.08098691
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0022724956380526
Score (-lnL):   125320.143545
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022632201456524
Score (-lnL):  125320.0882598
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022678578918525
Score (-lnL):  125320.0662948
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022655390187524
Score (-lnL):  125320.0648584
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022655390187524
Score (-lnL):  125320.0648584
Lambda: 0.0022659254976024
Score (-lnL): 125320.06338383
Lambda: 0.0022666984553025
Score (-lnL):  125320.0625129
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022668916947275
Score (-lnL): 125320.06272849
Lambda: 0.0022665052158774
Score (-lnL): 125320.06247031
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022664085961649
Score (-lnL): 125320.06251424
Lambda: 0.00226660183559
Score (-lnL): 125320.06246987

Completed 17 iterations
Time: 0H 0M 3S
Best match is: 0.00226660183559
Final -lnL: 125320.06246987

38 values were attempted (0% rejected)

Inferring processes for Base model
Score (-lnL): 125320.06246987
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!

Starting reconstruction processes for Base model
Done!

kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ 

マダラゲノムを使用した系統樹作成

~/tools/for_IQTREE/241019_plusOjiroディレクトリでIQTREE_1.pyを作成、実行した。

### IQTREE_1.py

##analysis_manual.pptxの#46も参照

##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh

##時間は10secほど

import numpy as np
import pandas as pd
import os

path = "/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/"
withpath = "../../for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/"

OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")

# ManualPhylo_dataディレクトリが存在しない場合は作成
os.makedirs(path + "ManualPhylo_data", exist_ok=True)

##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups_SingleCopyOrthologues.txt", "r") as fin:
    for line in fin:
        li = line.rstrip()
        new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "ManualPhylo_data/OG_list.txt", sep = " ", index = False, header = False)

##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。

li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "ManualPhylo_data/species_list.txt", "w") as file:
   for column_name in allspe2:
       file.write("%s\n" % column_name)

続いてconcatinate.shを作成、実行した。

#$ -S /bin/bash
#$ -cwd

echo start at
date

# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro"  ## Please replace with the actual directory containing the fasta files

# Define the output directory and output file
new="/home/kosukesano/tools/for_IQTREE/241019_plusOjiro"
mkdir -p $new

# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
    cat "$file" >> "${new}/all_seq.fa"
done


date

これによりall_seq.faができた。

次にIQTREE_2.pyを作り、実行。

### IQTREE_2.pyの中身

import sys
from Bio import SeqIO

path = "../../for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data/"

fasta_in = sys.argv[1]                                  #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2]                                  #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する

for q in open(query_in, "r"):                                           #オーソログファイルを開いて1行づつ読み込む
        query = q.split()                                                       #スペース毎に切りとってリスト形式でqueryに保存する
        f = open(path + query[0], 'w')                                  #最初の列(OG名)と同じ名前のファイルを作成する
        for record in SeqIO.parse(fasta_in, 'fasta'):   #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
                id_part = record.id                                     #fastaのID部分を読み込む
                desc_part = record.description                  #fastaのdescription部分を読み込む
                seq = record.seq                                        #fastaの配列部分を読み込む
                for i in range(len(query)):                         #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
                        if desc_part == query[i] :                  #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
                                fasta_seq = '>' + desc_part + '\n' + seq + '\n'         #fasta形式に整え
                                print(fasta_seq)                                        #標準出力にfastaを出力(進行状況把握用)
                                f.write(str(fasta_seq))                             #各OGファイルにfastaを出力
        f.close()

実行のコマンドは以下

python IQTREE_2.py all_seq.fa ../../for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data/OG_list.txt 

結構時間かかる

1020

~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_dataalign.shを作成した。

### align.shの中身

#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
    mafft --auto $x > $x.maffted.fa
    trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done

以下のコマンドで実行

(MPT) kosukesano@at138:~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data$ sh align.sh OG_list.txt 

これにより*.maffted.fa*.maffted.trimed.faができた。

続いて、同じディレクトリでmakerun.pyを作る。

### makerun.pyの中身

import glob
import os

list = []
for i in glob.glob('*.maffted.trimed.fa'):
        list.append(os.path.split(i)[1].rstrip())

#print(list[0])


##ls | grep "maffted.trimed.edit.fa" > otamesi.txtで、完成したOGをotamesi.txtに一行ずつ保存
##ファイルの行数をカウント。このカウント数がfor文のrangeに入る数になる

f = open("run.nex", "w")
f.write("#nexus" + "\n")
f.write("begin sets;" + "\n")
character = "charset part"
for line, i in zip(list, range(4997)):
        row = character + str(i+1) + " = " + line + ": ;"
        f.write("\t" + row + "\n")
f.write("end;" + "\n")
f.close()

これでrun.nexが出力される。

続いてIQ-TREEの実行。使ったシェルスクリプトはmanualphylo.sh

### manualphylo.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16

date
singularity exec -e /usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0 iqtree2 -sp run.nex -nt AUTO -bb 1000 -cptime 600
date

ASTRALの実行

同じディレクトリでmakealltree.shを書いた。

### makealltree.sh


#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"

# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data

# 出力ファイル
output_file="all_trees.nwk"

# 既存の出力ファイルを削除
if [ -f $output_file ]; then
    rm $output_file
fi

# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.fa; do
    # ファイル名から拡張子を除いたベース名を取得
    base_name=$(basename $file .maffted.trimed.fa)

    # Singularityを使用してIQ-TREEを実行して系統樹を作成
    singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}

    # 作成された系統樹ファイル (.treefile) を output_file に追加
    if [ -f ${base_name}.treefile ]; then
        echo -n "${base_name}: " >> $output_file
        cat ${base_name}.treefile >> $output_file
        echo "" >> $output_file
    else
        echo "Error: ${base_name}.treefile not found" >&2
    fi
done

echo "All trees have been written to $output_file"

date

これをqsubで投げた。

branchモデルの解釈

A_FN=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/ASTRAL_free_vs_neut_lrt_results_with_w_ratios.txt", sep="\t")|>
  rename(A_FN_OG_num = OG_num, A_FN_p_val = p_val, A_FN_significant = positive_selection, A_FN_w_ratio=Smad_w_ratio)|>
  dplyr::filter(A_FN_significant == "+") |>
  dplyr::filter(A_FN_w_ratio != "None") 

A_FM=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/ASTRAL_free_vs_M0_lrt_results_with_w_ratios.txt", sep="\t")|>
  rename(A_FN_OG_num = OG_num, A_FM_p_val = p_val, A_FM_significant = positive_selection, A_FM_w_ratio=Smad_w_ratio)|>
  dplyr::filter(A_FM_significant == "+") |>
  dplyr::filter(A_FM_w_ratio != "None")

A_branch=dplyr::full_join(A_FN, A_FM, by  = "A_FN_OG_num")
# 1018行

I_FN=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/IQTREE_free_vs_neut_lrt_results_with_w_ratios.txt", sep="\t")|>
  rename(I_FN_OG_num = OG_num, I_FN_p_val = p_val, I_FN_significant = positive_selection, I_FN_w_ratio=Smad_w_ratio)|>
  dplyr::filter(I_FN_significant == "+") |>
  dplyr::filter(I_FN_w_ratio != "None") 

I_FM=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/IQTREE_free_vs_M0_lrt_results_with_w_ratios.txt", sep="\t")|>
  rename(I_FN_OG_num = OG_num, I_FM_p_val = p_val, I_FM_significant = positive_selection, I_FM_w_ratio=Smad_w_ratio)|>
  dplyr::filter(I_FM_significant == "+") |>
  dplyr::filter(I_FM_w_ratio != "None") 

I_branch=dplyr::full_join(I_FN, I_FM, by  = "I_FN_OG_num")
# 905行

branch=dplyr::full_join(A_branch, I_branch, by = c(A_FN_OG_num = "I_FN_OG_num"))

orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"

orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim(orthogroups_file, header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")|>
  rename(gene_ID = V5)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) 

signal=read.csv("/Users/kosukesano/bio/out_madara_SP.txt", sep="\t")|>
  dplyr::select("X..SignalP.5.0", "Organism..Eukarya")|>
  rename(gene_ID = X..SignalP.5.0, signal_seq = Organism..Eukarya)

branch2=dplyr::left_join(branch, orthogroups, by = c(A_FN_OG_num = "V1"))|>
  dplyr::left_join(fa, by = c(gene_ID = "Madara"))|>
  dplyr::left_join(deg_all, by  = "gene_ID")|>
  dplyr::left_join(signal, by  = "gene_ID")

b_free VS b_neutの比較(特定の枝でdN/dSが1より大きいか)

    A_FN_p_val A_FN_w_ratio   gene_ID
1 2.292090e-04      2.83305 g10787.t1
2 4.515303e-14      1.89794  g9945.t1
                                                       Sory_GeneFunction
1 probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
2                                                  laminin subunit alpha
  ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1          2.542604           3.48e-17                  NA                   NA
2         -1.616277           9.89e-06                  NA                   NA
  adult.mlarva_log2FC adult.mlarva_adjPval  signal_seq
1                  NA                   NA       OTHER
2                  NA                   NA SP(Sec/SPI)

2つの遺伝子について、マダラの枝でdN/dSが1より大きい。これらはbranch_siteでも取れた遺伝子。IQTREEについても同様だった。

    A_FN_p_val A_FN_w_ratio   gene_ID
1 1.115680e-04   0.00416139  g6560.t1
2 1.775299e-10    0.0266867  g6092.t1
3 1.507048e-08    0.0288913  g1585.t1
4 3.564588e-02    0.0545569 g12688.t1
5 2.624653e-02    0.0694119 g11623.t1
6 2.751868e-04    0.0468634  g8108.t1
                                  Sory_GeneFunction ovary.body_log2FC
1                                     myosin-1-like        -2.8040404
2 U2 snRNP-associated SURP motif-containing protein                NA
3      conserved oligomeric Golgi complex subunit 6                NA
4 transmembrane emp24 domain-containing protein bai         0.5153544
5                          isocitrate dehydrogenase        -1.1550132
6                            adenosylhomocysteinase         1.4553454
  ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1       7.398570e-04            2.381792          0.000534682
2                 NA                  NA                   NA
3                 NA                  NA                   NA
4       6.420052e-03                  NA                   NA
5       2.580000e-05                  NA                   NA
6       4.010000e-16                  NA                   NA
  adult.mlarva_log2FC adult.mlarva_adjPval  signal_seq
1            2.908176             2.28e-05       OTHER
2                  NA                   NA       OTHER
3                  NA                   NA       OTHER
4                  NA                   NA SP(Sec/SPI)
5                  NA                   NA       OTHER
6                  NA                   NA       OTHER

920つの遺伝子について、マダラの枝でdN/dSが1より小さい(はじめの6行のみ表示)。IQTREE版では958遺伝子で、ASTRALの結果を完全に包含する。

見たいのは小さい方だけど、ちょっと多いな……。

ShinyGOでGO解析してみる。

bn_neg_go=read.csv("/Users/kosukesano/bio/for_shinygo/241020_bn_neg_enrichment_all.csv", sep=",")|>
    print()
   Enrichment.FDR nGenes Pathway.Genes Fold.Enrichment
1    1.111133e-15     41           133        4.731786
2    5.275277e-06     16            52        4.722908
3    4.448801e-05    115          1151        1.533611
4    4.448801e-05     25           133        2.885235
5    5.147560e-04     21           119        2.708727
6    5.764535e-04     11            40        4.221099
7    1.320669e-03     10            37        4.148500
8    1.477654e-03     10            38        4.039329
9    3.373044e-03     19           121        2.410244
10   5.177854e-03      8            30        4.093187
11   9.279724e-03     18           123        2.246261
12   1.530829e-02     15            99        2.325674
13   1.847989e-02     13            82        2.433449
14   2.118504e-02      6            23        4.004204
15   2.188038e-02      7            31        3.466005
16   2.700478e-02     11            68        2.482999
17   4.775602e-02      4            13        4.722908
18   7.527918e-02     17           145        1.799591
19   7.932791e-02     12            91        2.024103
20   8.496575e-02      5            24        3.197802
21   8.496575e-02      6            33        2.790809
22   8.496575e-02      6            33        2.790809
23   9.341023e-02      6            34        2.708727
24   1.854880e-01     12           107        1.721434
25   1.854880e-01      4            21        2.923705
                                                                Pathway
1                                                Path:dme03010 Ribosome
2                                              Path:dme03050 Proteasome
3                                      Path:dme01100 Metabolic pathways
4             Path:dme04141 Protein processing in endoplasmic reticulum
5                                             Path:dme03040 Spliceosome
6                             Path:dme03022 Basal transcription factors
7                                   Path:dme00510 N-Glycan biosynthesis
8                              Path:dme03420 Nucleotide excision repair
9                                       Path:dme01200 Carbon metabolism
10                 Path:dme00513 Various types of N-glycan biosynthesis
11                                            Path:dme04144 Endocytosis
12                            Path:dme03013 Nucleocytoplasmic transport
13                      Path:dme03008 Ribosome biogenesis in eukaryotes
14                                         Path:dme03060 Protein export
15                                         Path:dme03020 RNA polymerase
16                            Path:dme01230 Biosynthesis of amino acids
17 Path:dme00534 Glycosaminoglycan biosynthesis-heparan sulfate/heparin
18                              Path:dme00190 Oxidative phosphorylation
19                                             Path:dme04146 Peroxisome
20                              Path:dme00030 Pentose phosphate pathway
21              Path:dme00280 Valine leucine and isoleucine degradation
22                                 Path:dme03250 Viral life cycle-HIV-1
23                                        Path:dme03030 DNA replication
24                         Path:dme04120 Ubiquitin mediated proteolysis
25              Path:dme04130 SNARE interactions in vesicular transport
                                                    URL
1  http://www.genome.jp/kegg-bin/show_pathway?dme03010 
2  http://www.genome.jp/kegg-bin/show_pathway?dme03050 
3  http://www.genome.jp/kegg-bin/show_pathway?dme01100 
4  http://www.genome.jp/kegg-bin/show_pathway?dme04141 
5  http://www.genome.jp/kegg-bin/show_pathway?dme03040 
6  http://www.genome.jp/kegg-bin/show_pathway?dme03022 
7  http://www.genome.jp/kegg-bin/show_pathway?dme00510 
8  http://www.genome.jp/kegg-bin/show_pathway?dme03420 
9  http://www.genome.jp/kegg-bin/show_pathway?dme01200 
10 http://www.genome.jp/kegg-bin/show_pathway?dme00513 
11 http://www.genome.jp/kegg-bin/show_pathway?dme04144 
12 http://www.genome.jp/kegg-bin/show_pathway?dme03013 
13 http://www.genome.jp/kegg-bin/show_pathway?dme03008 
14 http://www.genome.jp/kegg-bin/show_pathway?dme03060 
15 http://www.genome.jp/kegg-bin/show_pathway?dme03020 
16 http://www.genome.jp/kegg-bin/show_pathway?dme01230 
17 http://www.genome.jp/kegg-bin/show_pathway?dme00534 
18 http://www.genome.jp/kegg-bin/show_pathway?dme00190 
19 http://www.genome.jp/kegg-bin/show_pathway?dme04146 
20 http://www.genome.jp/kegg-bin/show_pathway?dme00030 
21 http://www.genome.jp/kegg-bin/show_pathway?dme00280 
22 http://www.genome.jp/kegg-bin/show_pathway?dme03250 
23 http://www.genome.jp/kegg-bin/show_pathway?dme03030 
24 http://www.genome.jp/kegg-bin/show_pathway?dme04120 
25 http://www.genome.jp/kegg-bin/show_pathway?dme04130 
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Genes
1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       RpS26  mRpL13  mRpL20  RpL28  RpL21  mRpL23  RpS10b  RpL13A  RpL5  RpL10  RpS3A  mRpL9  RpL12  RpL7A  mRpL11  RpS9  RpL23  RpS17  RpS16  RpLP1  bonsai  RpL13  RpL7  mRpL12  mRpL15  RpL4  RpS2  RpL14  RpL18A  RpS25  RpL24-like  RpL26  RpL36A  RpL22  RpL36  mRpL2  tko  RpS15  RpL18  RpS11  mRpL21
2                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           Rpt5  Rpn5  Rpn2  Prosbeta7  Prosbeta5  Rpt3  Rpn11  CG30382  Prosbeta2  Rpn8  Rpt4  Prosbeta6  Rpn12  Rpn3  Prosalpha6  Prosalpha2
3   Prat2  ttv  CG10166  CG10425  amd  ScsbetaG  Ddc  Sc2  ATPsynbeta  Sirt7  Ahcy  beta4GalT7  Alg9  alpha-Man-Ib  PIG-B  nSMase  sro  PIG-C  ND-30  CG12338  ND-B14.5B  Alg2  PIG-U  S-Lap7  Sgsh  CG15093  botv  Stt3A  GCS1  CG1673  p23  Glo1  ScpX  CG17333  Eno  FIG4  CG18003  Alg1  Aprt  Gk1  FeCH  Pgant6  Gclc  CG2767  Taldo  Tdc2  Rpe  Prx6005  Pgk  fbp  Sirt4  ND-ASHI  Hsepi  Alg10  ND-MLRQ  Mccc2  AsnS  AdSL  ND-19  Pgd  Paics  Fum1  UQCR-11  UQCR-C2  Got2  Hmgs  Mipp2  CG44243  PIG-L  CTPsyn  Mtpbeta  ATPsynF  Agpat4  Cyt-c1  CG5009  Las  RnrL  Pi3K59F  Ppcs  Ppox  Men-b  P5cr  ND-39  ATPsynD  Dak1  rt  CG6218  OstDelta  GlcT  Idh3b  CG6638  Hacd1  CG6910  VhaPPA1-1  PyK  GlyP  CG7461  Pgant35A  Stt3B  Cds  Vha36-1  ATPsynB  Sps1  trx  CG8665  CG8745  Sply  Ost48  mtm  ND-51  ND-ACP  mAcon1  Amacr  AcCoAS  CG9886
4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  CG10973  alpha-Man-Ib  Der-2  Stt3A  GCS1  Roc1a  Rad23  l(1)G0320  CG4603  Plap  Gint3  Gp93  CG5823  CG5885  OstDelta  Sec13  wbl  eff  Stt3B  Hsc70-2  CG7945  P58IPK  ERp60  Ost48  Calr
5                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          l(2)37Cb  PQBP1  CG17768  Hpr1  noi  l(1)G0007  hoip  snf  BCAS2  Prp19  CG6015  U4-U6-60K  CG6418  CG6841  Prp31  CG7483  Hsc70-2  Prp3  Bx42  snRNP-U1-70K  CG9346
6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             TfIIEalpha  TfIIEbeta  Cdk7  Taf11  TfIIB  TfIIFbeta  Taf8  CycH  Mat1  hay  Tfb1
7                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    Alg9  alpha-Man-Ib  Alg2  Stt3A  GCS1  Alg1  Alg10  OstDelta  Stt3B  Ost48
8                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  Roc1a  Rad23  Cdk7  mei-9  CycH  Mat1  hay  Tfb1  PCNA  RPA2
9                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                            ScsbetaG  CG15093  CG17333  Eno  CG18003  Taldo  Rpe  Pgk  fbp  Pgd  Fum1  Got2  CG5009  Men-b  Idh3b  PyK  mAcon1  AcCoAS  CG9886
10                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Alg9  alpha-Man-Ib  Alg2  Stt3A  Alg1  OstDelta  Stt3B  Ost48
11                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 cpa  Vps36  Arpc2  Arf102F  Cdc42  Vps25  cpb  Snx1  Chmp1  Arpc4  Vps60  Snx3  AP-2mu  Hsc70-2  Vta1  Arf51F  Vps45  TSG101
12                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Karybeta3  Nxt1  Hpr1  Tnpo-SR  Pym  thoc6  Bin1  Nup35  Nup107  Sec13  cdm  CG7483  Phax  Kr-h2  Kap-alpha3
13                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          Rat1  Nxt1  Ns3  eIF6  CG2972  Nop60B  hoip  Ns1  Ns2  CG8064  CG8549  Non1  CG9107
14                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             Spase25  SrpRalpha  Srp54k  Srp14  Srp72  CG9240
15                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     l(2)37Cg  CG12267  RpI12  RpII15  CG3756  RpI135  RpII33
16                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           CG1673  Eno  Taldo  Rpe  Pgk  AsnS  Got2  P5cr  Idh3b  PyK  mAcon1
17                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               beta4GalT7  botv  Hsepi  Hs6st
18                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         ATPsynbeta  ND-30  ND-B14.5B  ND-ASHI  ND-MLRQ  ND-19  UQCR-11  UQCR-C2  ATPsynF  Cyt-c1  ND-39  ATPsynD  VhaPPA1-1  Vha36-1  ATPsynB  ND-51  ND-ACP
19                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        Hacl  CG12338  CG14778  CG1662  ScpX  CG18003  CG5009  Pex19  Pex3  Prx5  Sod3  Amacr
20                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                CG17333  Taldo  Rpe  fbp  Pgd
21                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                CG15093  CG1673  Mccc2  Hmgs  Mtpbeta  CG6638
22                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   spt4  Tnpo-SR  Su(Tpl)  Cdk9  BicD  TSG101
23                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         CG11164  dpa  Mcm5  Mcm7  PCNA  RPA2
24                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                Vhl  Uba3  Cul5  Roc1a  UbcE2H  CG2924  fzy  Prp19  CG5823  eff  Uba2  CG7747
25                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    Sec20  Vti1a  Syx5  Gos28

結構取れた。

特に

あたりは物質分泌に関わっていそう

CAFE5で増幅が検出された遺伝子をShinyGOによりGO解析にかける

出力ファイルはこんな感じ。

cafe_go=read.csv("/Users/kosukesano/bio/for_shinygo/241020_cafe_shinygo.csv", sep=",")|>
    print()
  Enrichment.FDR nGenes Pathway.Genes Fold.Enrichment
1     0.09676809      1            24        19.40000
2     0.09676809      1            35        13.30286
3     0.09676809      1            32        14.55000
                                      Pathway
1     Path:dme00030 Pentose phosphate pathway
2          Path:dme00052 Galactose metabolism
3 Path:dme00500 Starch and sucrose metabolism
                                                   URL   Genes
1 http://www.genome.jp/kegg-bin/show_pathway?dme00030      Gld
2 http://www.genome.jp/kegg-bin/show_pathway?dme00052   Mal-A8
3 http://www.genome.jp/kegg-bin/show_pathway?dme00500   Mal-A8

取れてきた経路は以下の通り。

1021

オジロを含めたIQ-TREE続き

各ファイルのヘッダーに遺伝子名が入っていて、別種扱いになって出力ファイルが出なかった。

~/tools/for_IQTREE/241019_plusOjiroIQTREE_3.pyを作成、実行した。

### IQTREE_3.pyの中身

import os

# ファイルのヘッダーを変更する関数
def modify_headers(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if line.startswith(">"):
                # ヘッダー行の最初の四文字を抽出して書き換え
                outfile.write(f">{line[1:5]}\n")
            else:
                outfile.write(line)

# 作業ディレクトリ内のすべての ".maffted.trimed.fa" ファイルに対して処理を適用し、出力を別ディレクトリに保存
def process_directory(input_directory, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    for filename in os.listdir(input_directory):
        if filename.endswith(".maffted.trimed.fa"):
            input_file = os.path.join(input_directory, filename)
            output_file = os.path.join(output_directory, filename.replace(".maffted.trimed.fa", ".maffted.trimed.edit.fa"))
            modify_headers(input_file, output_file)
            print(f"Processed: {filename}")

# 実行するディレクトリを指定
input_directory = "/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data"
output_directory = "/home/kosukesano/tools/for_IQTREE/241019_plusOjiro"

process_directory(input_directory, output_directory)

ここでmakerun.pymanualphylo.shをもう一度作成し、実行した。

1030

進化解析で使うゾウムシの選定

Dendroctonus ponderosaeと同族の種

これらはどちらもBRAKERをかけてみて、どっちかorどっちも使う。

なお、Ceutorhynchus assimilisおよびその亜科であるCeutorhynchinae(サルゾウムシ亜科)に該当する種はNCBIには登録されていなかった……。

とりあえずこの2種のゲノムを遺伝研に送ってソフトマスク

Dfroについて

:~/Downloads$ scp /Users/kosukesano/Downloads/Dfro.zip kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data/Dfro_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Dfro.zip                                                                                                                                                 100%   52MB  90.6MB/s   00:00    
>CM078935.1 Dendroctonus frontalis isolate MC-2024a chromosome 1, whole genome shotgun sequence
AAGTTGTAACGATGTACTGTTCTGTTTAAGCTCCGATAGTTCTTCGTCGGATTGTTGTGCTTGACATAGTTCTGCTGTAC
TGACTATGACAGGAAAAGCTATTGCATCAATGCGAGAAAGTGCATCAGCAATAACGTTTTCAACACCAGATACGTGGACT
ATGTGAGTCGTGAACTGTCCTATAAAGTCCAGGTGTCTGAGTTGTCTTGGAGTGGCTTTGTCAGCCTTTTGCCGGAAGGC
GAATATGAGCGGTTTGTGATCGGTCTTGATGATTAACTGTCGGCCTTCTACCATAAATCTGAAGAATTTCAAACTGGTGT
AAATAGCTAGTAGTTCACGATCGTACGTGCTATAACTCGACTGAGCGTTGCTGAATTTCTTTGAAAAGAATCCTAGTGGC
TCCCAGCAACCATTATTGTGTTGTTCTAGCACGGCACCCATTGCGGTATCCGATGCATCGGTGTAAAGAGCTAATGGTGC
ATTATCTTTAGGATGGTTTAGTAAAGAAGCTGTAGTCAGTTGTTGCTTGCATTGTTCAAAAGCTTCTTTAAGTTCATCCG
TCCAGTTGATGGGGCGTTTGTCACGTTTCTTAGCGCCAGCTAGTAATGCATGAAGTGGTGTTTGTGTAGAGGCAGCATTT
CGGATAAAACGCCTATAGAAATTGATAACGCCTAGAAAACGTCGCATGTCTGCAATGGTAGCTGGTTGAGGATATTCCTG
TATTGAGGCTACTCGTTCGGGTAACGGTCTGGTACCTTCACCATTTATCAGGTATCCCAGATagttaatttcagattttc
caAACTGGCACTTGGCGATATTAATGGATATGCCGTATTGTCTCAGTCGATTGAAGACTTGAGTGAGATGTTCTATGTGT
TCTTCAAGAGTTGATGAAGCTACAAGTATGTCGTCAATATAACAAAAGACAAACTTGAAGTCGTGTAGCACTAGATTCAT
AAATCGTTGGAAAGTCTGAGCTGCATTACATAATCCAAATGTCATTACGTTGAACTCGAATAAACCAAATGGCGTTATGA
TGGCGGTTTTGGGACGATCTTCTGGTAGTACAGGTATTTGATTATATGCCCTTACAAGATCTAGTGAACTGAATATTTTG
GTGCCTTGCAATTTGTGTGCAAAATCTTGAATGTGAGCAATTGGATATTTGTCTGGTAGGGTGACACTGTTAAGACGACG
ATAATCACCACAAGGGCGCCAGTctccatttttctttggtaCTAAGTGTAGTGGGCTGGCCCAAGGACTATTTGATGGAC
TGCACATACCTTGTTCTACCATAAAATTGAATTCCGCTTCTGCTAGCTTCAATTTTTCGGGAGACAGCCTTCTAGCTCTG
TCAGCTAGGGGTGGACCAGTAGTCTCTATGTGGTGGTAGATTCCATGTGAAGGATTTAAGATGCTTGGTTTGGAAGGAAC
AGttaaatctgcaaatttgtccaaaagttttttaaatggagtATTTCCGGATATAGTGCATATATTTCCTTGTGGGTACG
CAAATATTGCTCCTTTACTATTTAAAGTTGTGGTGCTGTCGATgagctttttgtttttaagatcTACCAATAGTCCAAAG
TGATTGAGAAAATCTGCTCCGAGCATGGGACGTGAAACATCTGCTATCGTGAATGGCCAACGAAAGAGGCGGCGTAGGCC
GAGATCCACGTTTAGTAATTGCTGTCCATAAGTATTTATTTCTGTGTTGTTGGCTGCGTATAGTTTGTAGTTCGATGCG

ソフトマスクはされていそう

kosukesano@at138:~/tools/for_softmask/nama_data/Dfro_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a ncbi_dataset/data/GCA_040113315.1/GCA_040113315.1_ASM4011331v1_genomic.fna 
file                                                                        format  type  num_seqs      sum_len  min_len  avg_len     max_len     Q1     Q2      Q3  sum_gap         N50  Q20(%)  Q30(%)  GC(%)
ncbi_dataset/data/GCA_040113315.1/GCA_040113315.1_ASM4011331v1_genomic.fna  FASTA   DNA        373  173,601,287    1,003  465,419  42,498,342  1,754  3,607  18,959        0  24,829,404       0       0  36.62
kosukesano@at138:~/tools/for_softmask/nama_data/Dfro_data$ 

コンティグ数373とめちゃくちゃに繋がってて草。これをBRAKERの生データディレクトリにコピーする。

kosukesano@at138:~/tools/for_braker/nama_data$ cp ~/tools/for_softmask/nama_data/Dfro_data/ncbi_dataset/data/GCA_040113315.1/GCA_040113315.1_ASM4011331v1_genomic.fna Dfro.fna
kosukesano@at138:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta  Elaeidobius_kamerunicus.masked.fna                  Pst_NotUseEDTA_upper5000.fna  femo_busco.sh.o26221930     kohuki_busco.sh.po26238968
241017_Ojiro_masked.fa          GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  Sfem_RNAseq                   femo_busco.sh.pe26221930    kohuki_softmasked.fasta
BUSCO_OUTPUT_FEMO_GENOME        Madara_RNAseq                                       Sfem_pilon_softmasked.fasta   femo_busco.sh.po26221930    kohuki_softmasked_upper1000.fasta
BUSCO_OUTPUT_KOHUKI_GENOME      Ojiro_RNAseq                                        Sfem_softmasked.fasta         kohuki_busco.sh             length.txt
Dfro.fna                        Pst_NotUseEDTA.fna                                  busco_downloads               kohuki_busco.sh.e26238968   madaralength.txt
Ekam_NotUseEDTA.fna             Pst_NotUseEDTA_upper1000.fna                        femo_busco.sh                 kohuki_busco.sh.o26238968
Ekam_oomoji.fna                 Pst_NotUseEDTA_upper10000.fna                       femo_busco.sh.e26221930       kohuki_busco.sh.pe26238968
kosukesano@at138:~/tools/for_braker/nama_data$

~/tools/for_braker/DfroDfro_braker.shを作成、実行した。

### Dfro_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Dfro.fna\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --threads=16\
        --species=Dfro\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

Dvalについて

>JAJTJO010000001.1 Dendroctonus valens isolate Dva2017 scaffold_0, whole genome shotgun sequence
CCATAGTAGTAGATATTCATAATAAGTAGCAAAAAGGATCAGTCTACTCACATAGTAAGTCTACATTTCGAGAAACAAAA
TTGGAGAAGCTTTGAGGTAAACCAGTGGATGCGAATATGTGATAGATTAATTCCTGTTTGAGTTTTGCCTGAGAAAAGCG
TCGAAgctgaaagaaaaattgaaatccGTTATGAGACCAAACGCTCATAACCCACTTTAACCACATGAATACTAATTTAA
ATGCGCTACTACCTGAATTACCATTAGACCACAGCTGAAACGTTACGGTAACCATCGCTGGAAAATAGACAACATTTAGA
TGTAAACTACCACTTTCGACACACCCAGCTTGAATTCCGCATCCCATGAAGAACCGTCTCGATAAATTACTATTTCGGAA
TGCTTGATTTCTGTTAATGCAATTTGTCTCCCCGACATTAATTCTACGGAAATCGGTTGCAGATTTATTTCGAAATTGTT
GTATGCTCAAGCTGAAGACAAGAGAGATATTTTTGTCTGTCTCGTGTCTTTAAGATTAGCCGATTTTGTTTGATGTCACT
CGGCGCTTTAAATTTATGTGCCTCAAGATGTTCAAACACATTTATTGGACTGCAGTATTTTTCTCATGTTGCAATAAAAC
GCGAGATAAGATTGCAGCAAATCGGGCAATTGttgattcaaaaaattgttccCATACAAAACACcgattataaattaaat
tgtttttaaaaattattgtttgctGACTTCACAAAGGAAAATACTTTCTTACATATCTATAACCAAAACTTCTCGGAAGT
TAGTCTCAAATAGTCGTGGAAATGTGTAACTTCTTTTTTGTGTGCAACTTTAAAGCaaaatttcttgttttttgtAGGTC

こっちもソフトマスクされているっぽい。

kosukesano@at138:~/tools/for_softmask/nama_data/Dval_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a ncbi_dataset/data/GCA_024550625.1/GCA_024550625.1_ASM2455062v1_genomic.fna 
file                                                                        format  type  num_seqs      sum_len  min_len    avg_len    max_len     Q1      Q2       Q3  sum_gap        N50  Q20(%)  Q30(%)  GC(%)
ncbi_dataset/data/GCA_024550625.1/GCA_024550625.1_ASM2455062v1_genomic.fna  FASTA   DNA        922  322,406,506      905  349,681.7  9,688,036  9,173  21,821  258,314        0  1,658,008       0       0  36.67
kosukesano@at138:~/tools/for_softmask/nama_data/Dval_data$

min=905が若干不安だけど、多分いけんべ

~/tools/for_braker/DvalDval_braker.shを作成、qsubで投げた。

### Dval_braker.sh

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Dval.fna\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --threads=16\
        --species=Dval\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

date

Pissodes strobiゲノムの短いコンティグを除去

現状のデータはこんな感じ

kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA.fna
file                format  type  num_seqs        sum_len  min_len   avg_len    max_len     Q1     Q2      Q3  sum_gap      N50  Q20(%)  Q30(%)  GC(%)
Pst_NotUseEDTA.fna  FASTA   DNA     84,140  2,025,024,129      201  24,067.3  2,554,738  2,904  6,869  16,574        0  105,159       0       0  32.01

全長が2,025,024,129bp(2Gbp)で、コンティグ数が84,140

これについてseqkitを使ってPstrの短いコンティグを削る。どれくらい削ればいいかわからなかったので、とりあえず3パターン作ってみた

  • 1000bp未満を削ったもの

    kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 1000 Pst_NotUseEDTA.fna > Pst_NotUseEDTA_upper1000.fna
    [WARN] you may switch on flag -g/--remove-gaps to remove spaces
    kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA_upper1000.fna
    file                          format  type  num_seqs        sum_len  min_len   avg_len    max_len     Q1     Q2        Q3  sum_gap      N50  Q20(%)  Q30(%)  GC(%)
    Pst_NotUseEDTA_upper1000.fna  FASTA   DNA     83,427  2,024,625,705    1,000  24,268.2  2,554,738  2,970  6,975  16,735.5        0  105,221       0       0  32.01

    全長が2,024,625,705bp(99.9%)、コンティグ数が83,427(99.1%)に。

    これを使ってBRAKERを実行

    kosukesano@at138:~/tools/for_braker/Pstr/upper_1_k$ ls
    Pstr_1k_braker.sh
  • 5000bp未満を削ったもの

    kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 5000 Pst_NotUseEDTA.fna > Pst_NotUseEDTA_upper5000.fna
    [WARN] you may switch on flag -g/--remove-gaps to remove spaces
    kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA_upper5000.fna
    file                          format  type  num_seqs        sum_len  min_len   avg_len    max_len     Q1      Q2      Q3  sum_gap      N50  Q20(%)  Q30(%)  GC(%)
    st_NotUseEDTA_upper5000.fna  FASTA   DNA     50,349  1,939,201,939    5,000  38,515.2  2,554,738  8,113  13,479  30,372        0  115,418       0       0  31.86

    全長が1,939,201,939bp(95.7%)、コンティグ数が50,349(59.8%)に。

  • 10000bp未満を削ったもの

    kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 10000 Pst_NotUseEDTA.fna > Pst_NotUseEDTA_upper10000.fna
    [WARN] you may switch on flag -g/--remove-gaps to remove spaces
    kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA_upper10000.fna
    file                           format  type  num_seqs        sum_len  min_len  avg_len    max_len      Q1      Q2        Q3  sum_gap      N50  Q20(%)  Q30(%)  GC(%)
    Pst_NotUseEDTA_upper10000.fna  FASTA   DNA     32,051  1,806,682,273   10,000   56,369  2,554,738  14,212  22,767  52,547.5        0  131,293       0       0  31.61

    全長が1,806,682,273bp(89.2%)、コンティグ数が32,051(38.1%)に。

コフキゲノムの短いコンティグを除去

kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a kohuki_softmasked.fasta 
file                     format  type   num_seqs        sum_len  min_len  avg_len  max_len  Q1   Q2   Q3  sum_gap     N50  Q20(%)  Q30(%)  GC(%)
kohuki_softmasked.fasta  FASTA   DNA   2,372,896  3,664,337,660       48  1,544.2  151,585  86  100  363        0  15,058       0       0  32.29

とりあえず1000bp未満を切り捨ててみる

kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a kohuki_softmasked_upper1000.fasta 
file                               format  type  num_seqs        sum_len  min_len  avg_len  max_len     Q1     Q2      Q3  sum_gap     N50  Q20(%)  Q30(%)  GC(%)
kohuki_softmasked_upper1000.fasta  FASTA   DNA    397,892  3,349,012,532    1,000  8,416.9  151,585  1,967  4,187  10,292        0  17,050       0       0  32.27

総配列数は3,349,012,532bp(91.4%)、コンティグ数は397,892(16.8%)になった。コンティグ数がめちゃくちゃに減った。

これを使ってBRAKERをかけてみる

1031

オジロを加えた7種でのCAFE結果

241019のCAFEの結果をローカルに移した。

:~/bio/for_cafe$ mkdir 241019_cafe_original_data
:~/bio/for_cafe$ cd 241019_cafe_original_data/
:~/bio/for_cafe/241019_cafe_original_data$ pwd
/Users/kosukesano/bio/for_cafe/241019_cafe_original_data
:~/bio/for_cafe/241019_cafe_original_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241019_plusOjiro/results /Users/kosukesano/bio/for_cafe/241019_cafe_original_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Base_clade_results.txt                                                                                                     100%  209    11.5KB/s   00:00    
Base_asr.tre                                                                                                               100% 1870KB   3.5MB/s   00:00    
Base_count.tab                                                                                                             100%  293KB   3.8MB/s   00:00    
Base_results.txt                                                                                                           100%  160     8.8KB/s   00:00    
Base_family_likelihoods.txt                                                                                                100%  153KB   4.0MB/s   00:00    
Base_family_results.txt                                                                                                    100%  144KB   3.7MB/s   00:00    
Base_branch_probabilities.tab                                                                                              100%   72KB   2.5MB/s   00:00    
Base_change.tab                                                                                                            100%  398KB   4.3MB/s   00:00    
:~/bio/for_cafe/241019_cafe_original_data$ 
lines = readLines("/Users/kosukesano/bio/for_cafe/241019_cafe_original_data/results/Base_asr.tre")

# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", lines))
trees_end <- which(grepl("END;", lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]

# 不要なスペースを削除
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)

# データフレームに変換
library(tibble)
trees_df = tibble(Tree = trees_lines)

ex=trees_df|>###各枝で優位に増減したOGをTRUE/FALSEで表す
  #lines|> 
  tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
  dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
  dplyr::mutate(Smad = stringr::str_detect(tree, pattern="Smad<1>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
  dplyr::mutate(Ojiro = stringr::str_detect(tree, pattern="Ojiro<0>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
  dplyr::mutate(Smad_Ojiro = stringr::str_detect(tree, pattern="<2>\\*_")) |>
  dplyr::mutate(Cass = stringr::str_detect(tree, pattern="<3>\\*_")) |>
  dplyr::mutate(Ojiro_Cass = stringr::str_detect(tree, pattern="<4>\\*_")) |>
  dplyr::mutate(Dpon = stringr::str_detect(tree, pattern="<5>\\*_")) |>
  dplyr::mutate(Cass_Dpon = stringr::str_detect(tree, pattern="<6>\\*_")) |>
  dplyr::mutate(Agra = stringr::str_detect(tree, pattern="<7>\\*_")) |>
  dplyr::mutate(Curculionidae = stringr::str_detect(tree, pattern="<8>\\*_")) |>
  dplyr::mutate(Sory = stringr::str_detect(tree, pattern="<9>\\*_")) |>
  dplyr::mutate(Curculionoidea = stringr::str_detect(tree, pattern="<10>\\*_")) |>
  dplyr::mutate(Tcas = stringr::str_detect(tree, pattern="<11>\\*_")) |>
  dplyr::mutate(all = stringr::str_detect(tree, pattern="<12>\\*_"))|>
  print()
# A tibble: 8,318 × 15
   OG_num    tree  Smad  Ojiro Smad_Ojiro Cass  Ojiro_Cass Dpon  Cass_Dpon Agra 
   <chr>     <chr> <lgl> <lgl> <lgl>      <lgl> <lgl>      <lgl> <lgl>     <lgl>
 1 OG0000000 (Tca… FALSE FALSE FALSE      TRUE  TRUE       TRUE  TRUE      TRUE 
 2 OG0000002 (Tca… TRUE  FALSE FALSE      TRUE  TRUE       TRUE  FALSE     TRUE 
 3 OG0000005 (Tca… FALSE TRUE  FALSE      TRUE  FALSE      TRUE  FALSE     TRUE 
 4 OG0000006 (Tca… FALSE TRUE  FALSE      TRUE  FALSE      FALSE FALSE     FALSE
 5 OG0000007 (Tca… TRUE  FALSE TRUE       TRUE  FALSE      TRUE  FALSE     FALSE
 6 OG0000008 (Tca… TRUE  TRUE  FALSE      TRUE  FALSE      TRUE  TRUE      TRUE 
 7 OG0000010 (Tca… FALSE FALSE FALSE      TRUE  FALSE      TRUE  FALSE     FALSE
 8 OG0000011 (Tca… TRUE  FALSE FALSE      TRUE  FALSE      TRUE  FALSE     FALSE
 9 OG0000012 (Tca… FALSE FALSE FALSE      TRUE  FALSE      FALSE FALSE     FALSE
10 OG0000013 (Tca… FALSE TRUE  FALSE      TRUE  FALSE      FALSE FALSE     FALSE
# ℹ 8,308 more rows
# ℹ 5 more variables: Curculionidae <lgl>, Sory <lgl>, Curculionoidea <lgl>,
#   Tcas <lgl>, all <lgl>
### マダラでのみ有意なものを抽出
Smad_ex = ex|>
  dplyr::filter(Smad == "TRUE")|>
  dplyr::filter(Ojiro == "FALSE")|>
  dplyr::filter(Smad_Ojiro == "FALSE")|>
  dplyr::filter(Cass == "FALSE")|>
  dplyr::filter(Ojiro_Cass == "FALSE")|>
  dplyr::filter(Dpon == "FALSE")|>
  dplyr::filter(Cass_Dpon == "FALSE")|>
  dplyr::filter(Agra == "FALSE")|>
  dplyr::filter(Curculionidae == "FALSE")|>
  dplyr::filter(Sory == "FALSE")|>
  dplyr::filter(Curculionoidea == "FALSE")|>
  dplyr::filter(Tcas == "FALSE")|>
  dplyr::filter(all == "FALSE") |>
  print()
# A tibble: 19 × 15
   OG_num    tree  Smad  Ojiro Smad_Ojiro Cass  Ojiro_Cass Dpon  Cass_Dpon Agra 
   <chr>     <chr> <lgl> <lgl> <lgl>      <lgl> <lgl>      <lgl> <lgl>     <lgl>
 1 OG0000158 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 2 OG0000182 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 3 OG0000378 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 4 OG0000440 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 5 OG0000479 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 6 OG0000567 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 7 OG0000684 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 8 OG0001789 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 9 OG0002093 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
10 OG0002847 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
11 OG0002850 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
12 OG0003493 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
13 OG0003527 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
14 OG0006333 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
15 OG0006392 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
16 OG0007730 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
17 OG0008099 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
18 OG0009924 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
19 OG0011431 (Tca… TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
# ℹ 5 more variables: Curculionidae <lgl>, Sory <lgl>, Curculionoidea <lgl>,
#   Tcas <lgl>, all <lgl>
Bc=read.csv("/Users/kosukesano/bio/for_cafe/241019_cafe_original_data/results/Base_change.tab", sep="\t")

Smad_bc=Bc |>###マダラで増加した0Gの0G番号を抽出したファイル
  dplyr::select("FamilyID","Smad.1.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
  dplyr::mutate(Smad.1.= stringr::str_extract(Smad.1., r"(^\d+)")) |>#マダラの遺伝子量増減の列のうち、数字のみのもの(-がついておらず、遺伝子数が増加しているもの)を抽出
  tidyr::drop_na()|>
  dplyr::filter(Smad.1. != 0) #遺伝子数の増加分が0のものを除去


Smad_df=dplyr::inner_join(Smad_bc, Smad_ex, by = c(FamilyID = "OG_num"))|>##マダラで優位に増加したOGのOG番号を抽出したファイル
print()
    FamilyID Smad.1.
1  OG0000479      19
2  OG0000567       5
3  OG0000684       5
4  OG0001789       2
5  OG0002093       6
6  OG0002847       9
7  OG0002850      10
8  OG0003493       7
9  OG0003527       8
10 OG0006333       4
11 OG0006392       4
12 OG0007730       5
13 OG0008099       5
14 OG0009924       3
15 OG0011431       3
                                                                                                                                                                                                                   tree
1  (Tcas<11>_1:236.2,(Sory<9>_1:133.223,(Agra<7>_2:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_20:78.9022,Ojiro<0>_1:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
2   (Tcas<11>_5:236.2,(Sory<9>_4:133.223,(Agra<7>_2:112.172,(Dpon<5>_1:101.931,(Cass<3>_1:92.1677,(Smad<1>*_8:78.9022,Ojiro<0>_3:78.9022)<2>_3:13.2655)<4>_3:9.76356)<6>_3:10.2402)<8>_3:21.0518)<10>_3:102.977)<12>_4;
3   (Tcas<11>_3:236.2,(Sory<9>_1:133.223,(Agra<7>_1:112.172,(Dpon<5>_4:101.931,(Cass<3>_2:92.1677,(Smad<1>*_8:78.9022,Ojiro<0>_3:78.9022)<2>_3:13.2655)<4>_3:9.76356)<6>_3:10.2402)<8>_3:21.0518)<10>_3:102.977)<12>_3;
4   (Tcas<11>_6:236.2,(Sory<9>_3:133.223,(Agra<7>_3:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_3:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_2:21.0518)<10>_2:102.977)<12>_3;
5   (Tcas<11>_4:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_1:101.931,(Cass<3>_0:92.1677,(Smad<1>*_7:78.9022,Ojiro<0>_1:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_2;
6  (Tcas<11>_1:236.2,(Sory<9>_1:133.223,(Agra<7>_0:112.172,(Dpon<5>_1:101.931,(Cass<3>_0:92.1677,(Smad<1>*_10:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
7  (Tcas<11>_2:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_11:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
8   (Tcas<11>_2:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_8:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
9   (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_9:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
10  (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_2:92.1677,(Smad<1>*_5:78.9022,Ojiro<0>_1:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
11  (Tcas<11>_2:236.2,(Sory<9>_2:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_5:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
12  (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_6:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
13  (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_1:101.931,(Cass<3>_0:92.1677,(Smad<1>*_6:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
14  (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_2:92.1677,(Smad<1>*_4:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
15  (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_4:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
   Smad Ojiro Smad_Ojiro  Cass Ojiro_Cass  Dpon Cass_Dpon  Agra Curculionidae
1  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
2  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
3  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
4  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
5  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
6  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
7  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
8  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
9  TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
10 TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
11 TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
12 TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
13 TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
14 TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
15 TRUE FALSE      FALSE FALSE      FALSE FALSE     FALSE FALSE         FALSE
    Sory Curculionoidea  Tcas   all
1  FALSE          FALSE FALSE FALSE
2  FALSE          FALSE FALSE FALSE
3  FALSE          FALSE FALSE FALSE
4  FALSE          FALSE FALSE FALSE
5  FALSE          FALSE FALSE FALSE
6  FALSE          FALSE FALSE FALSE
7  FALSE          FALSE FALSE FALSE
8  FALSE          FALSE FALSE FALSE
9  FALSE          FALSE FALSE FALSE
10 FALSE          FALSE FALSE FALSE
11 FALSE          FALSE FALSE FALSE
12 FALSE          FALSE FALSE FALSE
13 FALSE          FALSE FALSE FALSE
14 FALSE          FALSE FALSE FALSE
15 FALSE          FALSE FALSE FALSE

15個の遺伝子ファミリーがマダラで特異的に増加した

これに機能アノテーションをつける

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.tsv", header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V6")



Smad_df2=dplyr::left_join(Smad_df, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
  dplyr::select(!c(Smad.1., tree)) 

Smad_df3 <- Smad_df2 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
  separate_rows(V6, sep = ", ") %>%
  rename(gene_ID = V6, family_ID = FamilyID)|>
  dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) 

fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")


Smad_df4=dplyr::left_join(Smad_df3, fa, by = c(gene_ID = "Madara"))|>###完成系
  print()
# A tibble: 114 × 23
   family_ID Smad  Ojiro Smad_Ojiro Cass  Ojiro_Cass Dpon  Cass_Dpon Agra 
   <chr>     <lgl> <lgl> <lgl>      <lgl> <lgl>      <lgl> <lgl>     <lgl>
 1 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 2 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 3 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 4 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 5 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 6 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 7 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 8 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
 9 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
10 OG0000479 TRUE  FALSE FALSE      FALSE FALSE      FALSE FALSE     FALSE
# ℹ 104 more rows
# ℹ 14 more variables: Curculionidae <lgl>, Sory <lgl>, Curculionoidea <lgl>,
#   Tcas <lgl>, all <lgl>, gene_ID <chr>, Ecoli <chr>, Ecol_GeneFunction <chr>,
#   Dmelanogaster <chr>, Dmel_GeneFunction <chr>, Tcastaneum <chr>,
#   Tcas_GeneFunction <chr>, Soryzae <chr>, Sory_GeneFunction <chr>

114個の遺伝子が抽出できた

2024年11月

ETEの導入

~/tools/for_ETE?ディレクトリを作成した。

本家サイト?によるとcondaで入れるといいと書いてあったが、ete3ete_toolchainのインストール時にパッケージの依存関係の競合が発生した

(ete3) kosukesano@at139:~/tools/for_ETE$ ete3 build check
Command 'ete3' not found, did you mean:
  command 'etex' from deb texlive-binaries (2021.20210626.59705-1ubuntu0.1)
Try: apt install <deb name>
(ete3) kosukesano@at139:~/tools/for_ETE$ 
Pinned packages:
  - python 3.8.*


Could not solve for environment specs
Encountered problems while solving:
  - package ete_toolchain-3.0.0-h73706c9_0 requires pmodeltest 1.4.*, but none of the providers can be installed

The environment can't be solved, aborting the operation

結局GitHubに書いてあった方法を使った。


(base) kosukesano@at138:~/tools/for_ETE$ pip install https://github.com/etetoolkit/ete/archive/ete4.zip
Collecting https://github.com/etetoolkit/ete/archive/ete4.zip
  Downloading https://github.com/etetoolkit/ete/archive/ete4.zip
     / 4.3 MB 15.1 MB/s 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting bottle (from ete4==4.0.0b2)
  Downloading bottle-0.13.2-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting brotli (from ete4==4.0.0b2)
  Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting numpy (from ete4==4.0.0b2)
  Downloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 2.0 MB/s eta 0:00:00
Collecting scipy (from ete4==4.0.0b2)
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 3.3 MB/s eta 0:00:00
Requirement already satisfied: requests in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from ete4==4.0.0b2) (2.28.1)
Requirement already satisfied: charset-normalizer<3,>=2 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (2.1.1)
Requirement already satisfied: idna<4,>=2.5 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (1.26.13)
Requirement already satisfied: certifi>=2017.4.17 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (2024.2.2)
Downloading bottle-0.13.2-py2.py3-none-any.whl (104 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.1/104.1 kB 5.9 MB/s eta 0:00:00
Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 61.7 MB/s eta 0:00:00
Downloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.3/16.3 MB 83.5 MB/s eta 0:00:00
Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.2/41.2 MB 53.6 MB/s eta 0:00:00
Building wheels for collected packages: ete4
  Building wheel for ete4 (pyproject.toml) ... done
  Created wheel for ete4: filename=ete4-4.0.0b2-cp310-cp310-linux_x86_64.whl size=3142932 sha256=b89f676934222cd824353b76450b0b96d9e243646c2da6aea7d17b2dd479bcc6
  Stored in directory: /tmp/pip-ephem-wheel-cache-ksa3wdi5/wheels/89/21/61/80025b2b6138108e4f3ee405a77c230502321e3e0a470f8492
Successfully built ete4
Installing collected packages: brotli, bottle, numpy, scipy, ete4
Successfully installed bottle-0.13.2 brotli-1.1.0 ete4-4.0.0b2 numpy-2.1.2 scipy-1.14.1
(base) kosukesano@at138:~/tools/for_ETE$ 

ただこれ間違えてmambabase環境でやっちゃったので、改めてete4の環境を作ってそちらでインストールした。

(base) kosukesano@at138:~/tools/for_ETE$ conda create -n ete4
Collecting package metadata (current_repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
  current version: 22.9.0
  latest version: 24.9.2

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/envs/ete4



Proceed ([y]/n)? y

Preparing transaction: done
Verifying transaction: done
Executing transaction: done
#
# To activate this environment, use
#
#     $ conda activate ete4
#
# To deactivate an active environment, use
#
#     $ conda deactivate

Retrieving notices: ...working... done
(base) kosukesano@at138:~/tools/for_ETE$ conda activate ete4
(ete4) kosukesano@at138:~/tools/for_ETE$ python -c "import ete4; print(ete4.__version__)"
Traceback (most recent call last):
  File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'ete4'
(ete4) kosukesano@at138:~/tools/for_ETE$ pip install https://github.com/etetoolkit/ete/archive/ete4.zip
Defaulting to user installation because normal site-packages is not writeable
Collecting https://github.com/etetoolkit/ete/archive/ete4.zip
  Downloading https://github.com/etetoolkit/ete/archive/ete4.zip
     - 4.3 MB 13.6 MB/s 0:00:00
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Installing backend dependencies ... done
  Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: ete4
  Building wheel for ete4 (pyproject.toml) ... done
  Created wheel for ete4: filename=ete4-0.0.0-cp310-cp310-linux_x86_64.whl size=3708108 sha256=62c7f65349abb0abd88fe4df93dfcd7c4c36fb3ae69ae7588a2863f40f725ba0
  Stored in directory: /tmp/pip-ephem-wheel-cache-lfo6wso4/wheels/89/21/61/80025b2b6138108e4f3ee405a77c230502321e3e0a470f8492
Successfully built ete4
Installing collected packages: ete4
Successfully installed ete4-0.0.0

実際に動かしてみる

(ete4) kosukesano@at138:~/tools/for_ETE$ python -c "import ete4; print(ete4.__version__)"
4.0.0-beta
(ete4) kosukesano@at138:~/tools/for_ETE$

動いてるっぽい。

環境もセットで作ったのでETE環境立ち上げ用のプロファイルを~/tools/pyenv_envに作成する。

### ETE_profileの中身

source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
        . "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
    else
        export PATH="/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
    fi
fi
unset __conda_setup
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
    . "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<

conda activate ete4

python -c "import ete4; print(ete4.__version__)"

実際に実行するとこんな感じ

kosukesano@at138:~/tools/pyenv_env$ source ETE_profile 
4.0.0-beta
(ete4) kosukesano@at138:~/tools/pyenv_env$ 

環境に入ると同時に4.0.0-betaって言ってくれる

試しにprint.pyを作成して実行してみる。ちなみにOG0008871.nwkは0930のASTRALの入力に使ったファイルから持ってきた。

### print.pyの中身

import ete4

print(ete4.__version__)

from ete4 import Tree

t = Tree(open('OG0008871.nwk'))

t.explore()
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ python print.py 
4.0.0-beta
Traceback (most recent call last):
  File "/lustre7/home/kosukesano/tools/for_ETE/test_241101/print.py", line 9, in <module>
    t.explore()
  File "ete4/core/tree.pyx", line 1095, in ete4.core.tree.Tree.explore
  File "/home/kosukesano/.local/lib/python3.10/site-packages/ete4/smartview/gui/server.py", line 36, in <module>
    from bottle import (
ModuleNotFoundError: No module named 'bottle'
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ 

こういうエラー。bottleをインストールしてもう一回

(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ pip install bottle
Defaulting to user installation because normal site-packages is not writeable
Collecting bottle
  Downloading bottle-0.13.2-py2.py3-none-any.whl (104 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.1/104.1 KB 2.6 MB/s eta 0:00:00
Installing collected packages: bottle
Successfully installed bottle-0.13.2
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ python print.py 
4.0.0-beta
https://github.com/etetoolkit/ete-data/raw/main/layouts/pfam2color.json -> /home/kosukesano/.local/share/ete/pfam2color.json
https://github.com/etetoolkit/ete-data/raw/main/layouts/smart2color.json -> /home/kosukesano/.local/share/ete/smart2color.json
Added tree tree-1 with id 0.
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ ls

なんか挙動が変わった。

1104

DfroのBRAKER結果

kosukesano@at139:~/tools/for_braker/Dfro$ ls
Dfro_braker.sh  Dfro_braker.sh.e27165165  Dfro_braker.sh.o27165165  Dfro_braker.sh.pe27165165  Dfro_braker.sh.po27165165  braker  gpu
kosukesano@at139:~/tools/for_braker/Dfro$ ls braker/
Augustus  GeneMark-EP  GeneMark-ES  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  prothint.gff  species  what-to-cite.txt
kosukesano@at139:~/tools/for_braker/Dfro$ cd braker/
kosukesano@at139:~/tools/for_braker/Dfro/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    19,978  8,071,872        2      404   18,409
kosukesano@at139:~/tools/for_braker/Dfro/braker$ 

ちゃんとできた。

Orthofinderのためにヘッダーを書き換えておく。

kosukesano@at138:~/tools/for_braker/Dfro/braker$ ls
Augustus  GeneMark-EP  GeneMark-ES  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  prothint.gff  species  what-to-cite.txt
kosukesano@at138:~/tools/for_braker/Dfro/braker$ cp braker.aa ../Dfro.fasta
kosukesano@at138:~/tools/for_braker/Dfro/braker$ cd ../
kosukesano@at138:~/tools/for_braker/Dfro$ ls
Dfro.fasta  Dfro_braker.sh  Dfro_braker.sh.e27165165  Dfro_braker.sh.o27165165  Dfro_braker.sh.pe27165165  Dfro_braker.sh.po27165165  braker  gpu
kosukesano@at138:~/tools/for_braker/Dfro$ edit.py
edit.py: command not found
kosukesano@at138:~/tools/for_braker/Dfro$ nano edit.py
kosukesano@at138:~/tools/for_braker/Dfro$ python edit.py 
../Dfro/RemakeHedder_Dfro/Dfro.fasta に保存しました。
kosukesano@at138:~/tools/for_braker/Dfro$ ls
Dfro.fasta  Dfro_braker.sh  Dfro_braker.sh.e27165165  Dfro_braker.sh.o27165165  Dfro_braker.sh.pe27165165  Dfro_braker.sh.po27165165  RemakeHedder_Dfro  braker  edit.py  gpu
kosukesano@at138:~/tools/for_braker/Dfro$ ls RemakeHedder_Dfro/
Dfro.fasta
kosukesano@at138:~/tools/for_braker/Dfro$ 

DvalのBRAKER結果

kosukesano@at139:~/tools/for_braker/Dval$ ls
Dval_braker.sh  Dval_braker.sh.e27165208  Dval_braker.sh.o27165208  Dval_braker.sh.pe27165208  Dval_braker.sh.po27165208  braker  gpu
kosukesano@at139:~/tools/for_braker/Dval$ ls braker/
Augustus  GeneMark-EP  GeneMark-ES  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  prothint.gff  species  what-to-cite.txt
kosukesano@at139:~/tools/for_braker/Dval$ cd braker/
kosukesano@at139:~/tools/for_braker/Dval/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs     sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    37,002  12,331,785        2    333.3   21,873
kosukesano@at139:~/tools/for_braker/Dval/braker$

こっちもできてそう。これもedit.pyを使ってヘッダー書き換えた。

PstrのBRAKERやり直し

1000bp未満を切ったやつは終わってた。他はまだ。

kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ ls
GeneMark-EP                     braker.gtf_temp                                   gc_content.out                                         nuc.fasta                 train.gb
GeneMark-EP.stdout              braker.log                                        gene_stat.yaml                                         optimize_augustus.stdout  train.gb.test
GeneMark-ES                     cmd.log                                           genemark_evidence.gff                                  prevHints.gff             train.gb.train
GeneMark-ES.stdout              diamond                                           genemark_hintsfile.gff                                 proteins.fa               train.gb.train.test
Spaln                           downsample_traingenes.log                         genome.fa                                              prothint.gff              train.gb.train.train
aa2nonred.stdout                ensure_min_n_training_genes.stdout                genome.fa.cidx                                         prothint_augustus.gff     traingenes.good.fa
aug_hints.lst                   errors                                            genome_header.map                                      protl4pnvjoe              traingenes.good.gtf
augustus.hints.tmp.gtf          etrain.bad.lst                                    genome_split                                           secondetraining.stdout    traingenes.good.nr.fa
augustus.hints_iter1.aa         evidence.gff                                      getAnnoFastaFromJoingenes.augustus.hints_hints.stdout  secondtest.stdout         traingenes.gtf
augustus.hints_iter1.codingseq  filterGenemark.stdout                             getAnnoFastaFromJoingenes.augustus.hints_tmp.stdout    seed_proteins.faa         uniqueSeeds.gtf
augustus.hints_iter1.gff        firstetraining.stdout                             getAnnoFastaFromJoingenes.braker_.stdout               species                   what-to-cite.txt
augustus.hints_iter1.gtf        firsttest.stdout                                  good_genes.lst                                         tmp_opt_Pstr_1k
braker.aa                       fix_IFS_log_hoefclri                              hints.job.lst                                          top_chains.gff
braker.codingseq                fix_in_frame_stop_codon_genes_augustus.hints.log  hintsfile.gff                                          train.f.gb
braker.gtf                      gbFilterEtraining.stdout                          nonred.loci.lst                                        train.ff.gb
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs     sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein   221,240  51,377,361        2    232.2   11,401
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ 

なんか出力ファイル多くね?でもできてそうではある。

kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ cp braker.aa ../Pstr_upper1k.fasta
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ 
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$ python edit.py 
../upper_1_k/RemakeHedder_Pstr/Pstr_upper1k.fasta に保存しました。
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$ ls
Pstr_1k_braker.sh  Pstr_1k_braker.sh.e27173011  Pstr_1k_braker.sh.o27173011  Pstr_1k_braker.sh.pe27173011  Pstr_1k_braker.sh.po27173011  Pstr_upper1k.fasta  RemakeHedder_Pstr  braker  edit.py
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$ less RemakeHedder_Pstr/Pstr_upper1k.fasta 
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$

edit.pyを使ってヘッダー書き換え。

合計10種でのOrthofinder

~/tools/for_orthofinder/241104_10spを作成、そこに10種分の.fastaファイルをコピーした。

kosukesano@at138:~/tools/for_orthofinder$ mkdir 241104_10sp
kosukesano@at138:~/tools/for_orthofinder$ ls
241019_6plusOjiro                    RemakeHedder_6sp                                          Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26310331
241104_10sp                          Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir                     Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26291666
CO1_6sp                              Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir              Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26310331
Orthofinder_240917_RH.sh             Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir                   Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26291666
Orthofinder_240917_RH.sh.e26802366   Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh                Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26310331
Orthofinder_240917_RH.sh.o26802366   Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.e26224546      make_philo_tree
Orthofinder_240917_RH.sh.pe26802366  Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.o26224546      seven_sp.sh
Orthofinder_240917_RH.sh.po26802366  Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.pe26224546     seven_sp.sh.e26639936
Orthofinder_241019.sh                Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.po26224546     seven_sp.sh.o26639936
Orthofinder_241019.sh.e27076911      Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh            seven_sp.sh.pe26639936
Orthofinder_241019.sh.o27076911      Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26291666  seven_sp.sh.po26639936
Orthofinder_241019.sh.pe27076911     Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26310331
Orthofinder_241019.sh.po27076911     Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26291666
kosukesano@at138:~/tools/for_orthofinder$ cd 241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ~/tools/for_braker/Dfro/RemakeHedder_Dfro/Dfro.fasta ../241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ~/tools/for_braker/Dval/RemakeHedder_Dval/Dval.fasta ../241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ~/tools/for_braker/Pstr/upper_1_k/RemakeHedder_Pstr/Pstr_upper1k.fasta ../241104_10sp/Pstr.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ ls
Dfro.fasta  Dval.fasta  Pstr.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ 
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ../241019_6plusOjiro/*.fasta ../241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ ls
Agra.fasta  Cass.fasta  Dfro.fasta  Dpon.fasta  Dval.fasta  Ojiro.fasta  Pstr.fasta  Smad.fasta  Sory.fasta  Tcas.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ 

ここでOrthofinder_241104.shを作成、qsubで投げた。

### Orthofinder_241104.sh

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l intel
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
        -f /home/kosukesano/tools/for_orthofinder/241104_10sp\
        -t 16

date

マダラを抜いてオジロを入れた6種でのOrthofinder

~/tools/for_orthofinder/241104_5plusOjiroを作成、そこに以下の方法で6種のゲノムをコピーした。

kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ cp ../241019_6plusOjiro/*fasta ../241104_5plusOjiro/
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Ojiro.fasta  Smad.fasta  Sory.fasta  Tcas.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ rm Smad.fasta 
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Ojiro.fasta  Sory.fasta  Tcas.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ 

ここでOrthofinder_241104_5plusOjiro.shを作成し、qsubで実行した。

こっちの方が早く終わった。

マダラを抜いてオジロを入れた6種でのIQTREE

~/tools/for_IQTREE/241104_5sp_plusOjiroを作成、その下でIQTREE_1.pyを実行した。

### IQTREE_1.py

##analysis_manual.pptxの#46も参照

##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh

##時間は10secほど

import numpy as np
import pandas as pd
import os

path = "/home/kosukesano/tools/for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/"
withpath = "../../for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/"

OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")

# ManualPhylo_dataディレクトリが存在しない場合は作成
os.makedirs(path + "ManualPhylo_data", exist_ok=True)

##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups_SingleCopyOrthologues.txt", "r") as fin:
    for line in fin:
        li = line.rstrip()
        new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "ManualPhylo_data/OG_list.txt", sep = " ", index = False, header = False)

##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。

li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "ManualPhylo_data/species_list.txt", "w") as file:
   for column_name in allspe2:
       file.write("%s\n" % column_name)

実行するとこんな感じ。

kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ python IQTREE_1.py 
      Orthogroup                Agra              Cass                Dpon           Ojiro                Sory                Tcas
8088   OG0008088  Agra_P_050292700.1  Cass_AG9761214.1  Dpon_P_019755574.2   Ojir_g1996.t1  Sory_P_030761209.1  Tcas_P_008195282.1
8089   OG0008089  Agra_P_050292731.1  Cass_AH1135743.1  Dpon_P_048519923.1   Ojir_g7978.t1  Sory_P_030765758.1  Tcas_P_008196870.1
8090   OG0008090  Agra_P_050292732.1  Cass_AG9767756.1  Dpon_P_019773495.1   Ojir_g6189.t1  Sory_P_030765067.1  Tcas_P_015836383.1
8091   OG0008091  Agra_P_050292739.1  Cass_AG9768060.1  Dpon_P_019769194.2   Ojir_g6137.t1  Sory_P_030755089.1     Tcas_P_969265.1
8092   OG0008092  Agra_P_050292743.1  Cass_AG9767942.1  Dpon_P_019767966.1   Ojir_g4737.t1  Sory_P_030750408.1     Tcas_P_971491.1
...          ...                 ...               ...                 ...             ...                 ...                 ...
10018  OG0010018  Agra_P_050316302.1  Cass_AG9767812.1  Dpon_P_019773553.2   Ojir_g6202.t1  Sory_P_030765225.1  Tcas_P_008194975.1
10021  OG0010021  Agra_P_050316346.1  Cass_AG9766968.1  Dpon_P_019768198.1   Ojir_g7446.t1  Sory_P_030747218.1  Tcas_P_015834541.1
10023  OG0010023  Agra_P_050316372.1  Cass_AG9765979.1  Dpon_P_019758814.2   Ojir_g9692.t1  Sory_P_030763414.1     Tcas_P_971352.1
10026  OG0010026  Agra_P_050316407.1  Cass_AG9762302.1  Dpon_P_019758828.1   Ojir_g9703.t1  Sory_P_030763403.1  Tcas_P_008196032.1
10027  OG0010027  Agra_P_050316412.1  Cass_AG9765463.1  Dpon_P_019755650.2  Ojir_g11721.t1  Sory_P_030763780.1  Tcas_P_008199734.2

[1480 rows x 7 columns]
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ 

続いてconcatinate.shを作成、実行した。

### concatinate.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/241104_5plusOjiro"  ## Please replace with the actual directory containing the fasta files

# Define the output directory and output file
new="/home/kosukesano/tools/for_IQTREE/241104_5sp_plusOjiro"
mkdir -p $new

# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
    cat "$file" >> "${new}/all_seq.fa"
done


date

これを実行するとこんな感じ

kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ nano concatinate.sh
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ sh concatinate.sh 
start at
Mon Nov  4 18:48:00 JST 2024
Mon Nov  4 18:48:01 JST 2024
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ ls
IQTREE_1.py  all_seq.fa  concatinate.sh
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ 

次にIQTREE_2.pyを作り、実行。

### IQTREE_2.pyの中身

import sys
from Bio import SeqIO

path = "../../for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_data/"

fasta_in = sys.argv[1]                                  #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2]                                  #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する

for q in open(query_in, "r"):                                           #オーソログファイルを開いて1行づつ読み込む
        query = q.split()                                                       #スペース毎に切りとってリスト形式でqueryに保存する
        f = open(path + query[0], 'w')                                  #最初の列(OG名)と同じ名前のファイルを作成する
        for record in SeqIO.parse(fasta_in, 'fasta'):   #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
                id_part = record.id                                     #fastaのID部分を読み込む
                desc_part = record.description                  #fastaのdescription部分を読み込む
                seq = record.seq                                        #fastaの配列部分を読み込む
                for i in range(len(query)):                         #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
                        if desc_part == query[i] :                  #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
                                fasta_seq = '>' + desc_part + '\n' + seq + '\n'         #fasta形式に整え
                                print(fasta_seq)                                        #標準出力にfastaを出力(進行状況把握用)
                                f.write(str(fasta_seq))                             #各OGファイルにfastaを出力
        f.close()

実行のコマンドは以下の通り

python IQTREE_2.py all_seq.fa ../../for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_data/OG_list.txt 

結構時間かかる。

続いて~/tools/for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_dataに移動、そこでMPT環境に入る。

source ~/tools/pyenv_env/ManualPhilo_profile 

そこでalign.shをコピーして作成、実行した。

### align.shの中身

#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
    mafft --auto $x > $x.maffted.fa
    trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done

実行のコマンドは以下の通り。

sh align.sh OG_list.txt

これもそこそこ時間がかかる

続いて、~/tools/for_IQTREE/241104_5sp_plusOjiroに移動し、IQTREE_3.pyを作成

### IQTREE_3.pyの中身

import os

# ファイルのヘッダーを変更する関数
def modify_headers(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        for line in infile:
            if line.startswith(">"):
                # ヘッダー行の最初の四文字を抽出して書き換え
                outfile.write(f">{line[1:5]}\n")
            else:
                outfile.write(line)

# 作業ディレクトリ内のすべての ".maffted.trimed.fa" ファイルに対して処理を適用し、出力を別ディレクトリに保存
def process_directory(input_directory, output_directory):
    if not os.path.exists(output_directory):
        os.makedirs(output_directory)
    
    for filename in os.listdir(input_directory):
        if filename.endswith(".maffted.trimed.fa"):
            input_file = os.path.join(input_directory, filename)
            output_file = os.path.join(output_directory, filename.replace(".maffted.trimed.fa", ".maffted.trimed.edit.fa"))
            modify_headers(input_file, output_file)
            print(f"Processed: {filename}")

# 実行するディレクトリを指定
input_directory = "/home/kosukesano/tools/for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_data"
output_directory = "/home/kosukesano/tools/for_IQTREE/241104_5sp_plusOjiro"

process_directory(input_directory, output_directory)

これを実行すると、*.maftted.trimed.edit.faファイルができる。

ここでmakerun.pyを作成。

### makerun.pyの中身

import glob
import os

list = []
for i in glob.glob('*.maffted.trimed.edit.fa'):
        list.append(os.path.split(i)[1].rstrip())

#print(list[0])


##ls | grep "maffted.trimed.edit.fa" > otamesi.txtで、完成したOGをotamesi.txtに一行ずつ保存
##ファイルの行数をカウント。このカウント数がfor文のrangeに入る数になる

f = open("run.nex", "w")
f.write("#nexus" + "\n")
f.write("begin sets;" + "\n")
character = "charset part"
for line, i in zip(list, range(4997)):
        row = character + str(i+1) + " = " + line + ": ;"
        f.write("\t" + row + "\n")
f.write("end;" + "\n")
f.close()

これでrun.nexファイルができる。これを使ってmanualphylo.shを実行。

### manualphylo.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16

date
singularity exec -e /usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0 iqtree2 -sp run.nex -nt AUTO -bb 1000 -cptime 600
date

これをqsubで投げた。

マダラを抜いてオジロを入れた6種でのASTRAL前準備

IQTREEと同じディレクトリで、*.maftted.trimed.edit.faファイルが揃った後に行う。

makealltree.shを作成し、実行。

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date

# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"

# 作業ディレクトリに移動
cd /home/kosukesano/tools/for_IQTREE/241104_5sp_plusOjiro

# 出力ファイル
output_file="all_trees.nwk"

# 既存の出力ファイルを削除
if [ -f $output_file ]; then
    rm $output_file
fi

# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
    # ファイル名から拡張子を除いたベース名を取得
    base_name=$(basename $file .maffted.trimed.edit.fa)

    # Singularityを使用してIQ-TREEを実行して系統樹を作成
    singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}

    # 作成された系統樹ファイル (.treefile) を output_file に追加
    if [ -f ${base_name}.treefile ]; then
        echo -n "${base_name}: " >> $output_file
        cat ${base_name}.treefile >> $output_file
        echo "" >> $output_file
    else
        echo "Error: ${base_name}.treefile not found" >&2
    fi
done

echo "All trees have been written to $output_file"

date

これをqsubで投げた。

ETEの続き

ETE4のファイル読み込みなどについてわかったことのまとめ

### print.pyの中身

import ete3

print(ete4.__version__)
print(ete3.__version__)

from ete4 import Tree

t = Tree(open('OG0008871.nwk')) #ツリー読み込み

t.explore()

print(t) #ツリーのプリント

# ルートノードに子ノードを追加
t.add_child(name="child1")
t.add_child(name="child2")

print(t)

### エラーが起きたコード###
# child1ノードを削除
#child1 = t & "child1"  # "child1"のノードにアクセス
#child1.detach()        # ノードをツリーから削除
###

# "child1"という名前のノードにアクセスする
#child1 = t.search_nodes(name="child1")[0]  # 検索結果はリストで返されるので、[0]で最初のノードを取得
#child1.detach()  # ノードをツリーから削除
###

# "child1"という名前のノードにアクセスする
child1 = list(t.search_nodes(name="child1"))[0]  # リストに変換してから最初のノードを取得
child1.detach()  # ノードをツリーから削除

print(t)

# "child1"という名前のノードにアクセスする
child2 = list(t.search_nodes(name="child2"))[0]  # リストに変換してから最初のノードを取得
# child2ノードの名前を変更
child2.name = "new_child2"
print(t)

# "Smad"という名前のノードにアクセスする
Smad = list(t.search_nodes(name="Smad"))[0]  # リストに変換してから最初のノードを取得
# Smadノードの名前を変更
Smad.name = "Smad#"
print(t)

# "child1"という名前のノードにアクセスする
new_child2 = list(t.search_nodes(name="new_child2"))[0]  # リストに変換してから最初のノードを取得
new_child2.detach()

# ファイルに書き込み
t . write ( parser = 1 ,  outfile = "new_tree.nwk" )

これを実行するとこんな感じ

(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ python print.py 
4.0.0-beta
3.1.3
Added tree tree-1 with id 0.
 ╭╴Agra
    ╭─┬╴Cass
─┤ ╭─┤ ╰╴Smad
 ├─┤ ╰╴Dpon
  ╰╴Sory
 ╰╴Tcas
 ╭╴Agra
    ╭─┬╴Cass
  ╭─┤ ╰╴Smad
─┼─┤ ╰╴Dpon
  ╰╴Sory
 ├╴Tcas
 ├╴child1
 ╰╴child2
 ╭╴Agra
    ╭─┬╴Cass
  ╭─┤ ╰╴Smad
─┼─┤ ╰╴Dpon
  ╰╴Sory
 ├╴Tcas
 ╰╴child2
 ╭╴Agra
    ╭─┬╴Cass
  ╭─┤ ╰╴Smad
─┼─┤ ╰╴Dpon
  ╰╴Sory
 ├╴Tcas
 ╰╴new_child2
 ╭╴Agra
    ╭─┬╴Cass
  ╭─┤ ╰╴Smad#
─┼─┤ ╰╴Dpon
  ╰╴Sory
 ├╴Tcas
 ╰╴new_child2
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$

また、これでできたnew_tree.nwkはこんな感じ

(Agra:0.173734,(((Cass:0.204166,Smad#:0.219138):0.0351109,Dpon:0.464555):0.0267139,Sory:0.350028):0.0524272,Tcas:0.548018);

葉に#をつけるだけならこれでいいか?

1105

マダラを抜いてオジロを入れた6種でのIQTREE結果

~/tools/for_IQTREE/241104_5sp_plusOjiro以下にrun.nex.treefileができていた

### run.nex.treefileの中身

(Agra:0.2094936641,(Cass:0.1912631030,(Dpon:0.2342863798,Ojir:0.1338157492)86:0.0164703012)100:0.0204609199,(Sory:0.2171214042,Tcas:0.5310501590)100:0.0698739346);

合計10種でのOrthofinder結果

kosukesano@at139:~/tools/for_orthofinder$ ls
241019_6plusOjiro                    Orthofinder_241019.sh.po27076911                          Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26291666
241104_10sp                          RemakeHedder_6sp                                          Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26310331
241104_5plusOjiro                    Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir                     Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26291666
CO1_6sp                              Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir              Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26310331
Orthofinder_240917_RH.sh             Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir                   Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26291666
Orthofinder_240917_RH.sh.e26802366   Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh                Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26310331
Orthofinder_240917_RH.sh.o26802366   Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.e26224546      make_philo_tree
Orthofinder_240917_RH.sh.pe26802366  Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.o26224546      seven_sp.sh
Orthofinder_240917_RH.sh.po26802366  Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.pe26224546     seven_sp.sh.e26639936
Orthofinder_241019.sh                Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.po26224546     seven_sp.sh.o26639936
Orthofinder_241019.sh.e27076911      Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh            seven_sp.sh.pe26639936
Orthofinder_241019.sh.o27076911      Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26291666  seven_sp.sh.po26639936
Orthofinder_241019.sh.pe27076911     Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26310331
kosukesano@at139:~/tools/for_orthofinder$ ls 241104_10sp/OrthoFinder/Results_Nov04/
Citation.txt                     Gene_Trees            Orthogroups                            Phylogenetically_Misplaced_Genes  Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics  Log.txt               Orthologues                            Putative_Xenologs                 Species_Tree
Gene_Duplication_Events          Orthogroup_Sequences  Phylogenetic_Hierarchical_Orthogroups  Resolved_Gene_Trees               WorkingDirectory
kosukesano@at139:~/tools/for_orthofinder$ 

できてそう

系統樹のファイルを見てみる

### SpeciesTree_rooted.txtの中身

(Tcas:0.176472,((Agra:0.174713,(Cass:0.172946,((Ojiro:0.111109,Smad:0.12931)0.196244:0.0198427,(Pstr:0.302189,(Dfro:0.104938,(Dpon:0.0337761,Dval:0.0588316)0.422295:0.0411228)0.785665:0.139116)0.229497:0.0481502)0.113887:0.0180141)0.171089:0.0194423)0.502757:0.0371578,Sory:0.190036)1:0.176472);

マダラとオジロがクレードを作っちゃってるな……。

合計10種でのIQTREE

~/tools/for_IQTREE/241105_10spを作成、その下でIQTREE_1.pyを実行した。

kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ ls
IQTREE_1.py
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ python IQTREE_1.py 
      Orthogroup                Agra              Cass            Dfro                Dpon  ...           Ojiro             Pstr            Smad                Sory                Tcas
12598  OG0012598  Agra_P_050292700.1  Cass_AG9761214.1  Dfro_g15387.t1  Dpon_P_019755574.2  ...   Ojir_g1996.t1  Pstr_g201064.t1   Smad_g6358.t1  Sory_P_030761209.1  Tcas_P_008195282.1
12600  OG0012600  Agra_P_050292798.1  Cass_AG9770235.1   Dfro_g4791.t1  Dpon_P_019769671.2  ...  Ojir_g10006.t1  Pstr_g112651.t1  Smad_g12750.t1  Sory_P_030747529.1     Tcas_P_971970.1
12601  OG0012601  Agra_P_050292813.1  Cass_AG9770251.1   Dfro_g4797.t1  Dpon_P_019769634.2  ...  Ojir_g10010.t1  Pstr_g123863.t1   Smad_g5261.t1  Sory_P_030747567.1     Tcas_P_968688.1
12602  OG0012602  Agra_P_050292817.1  Cass_AG9770190.1   Dfro_g4800.t1  Dpon_P_019769690.1  ...  Ojir_g10011.t1  Pstr_g123862.t1   Smad_g5262.t1  Sory_P_030747568.1     Tcas_P_968766.1
12603  OG0012603  Agra_P_050292879.1  Cass_AG9762270.1  Dfro_g12395.t1  Dpon_P_019773117.1  ...   Ojir_g9721.t1   Pstr_g92262.t1  Smad_g12600.t1  Sory_P_030759522.1     Tcas_P_972888.1
...          ...                 ...               ...             ...                 ...  ...             ...              ...             ...                 ...                 ...
13550  OG0013550  Agra_P_050316219.1  Cass_AG9762145.1  Dfro_g12323.t1  Dpon_P_019755290.1  ...   Ojir_g5306.t1  Pstr_g188737.t1    Smad_g695.t1  Sory_P_030763497.1     Tcas_P_974991.1
13553  OG0013553  Agra_P_050316250.1  Cass_AG9766496.1   Dfro_g5342.t1  Dpon_P_019767539.2  ...   Ojir_g3020.t1  Pstr_g200977.t1   Smad_g4018.t1  Sory_P_030760091.1     Tcas_P_972970.1
13554  OG0013554  Agra_P_050316281.1  Cass_AG9761564.1  Dfro_g15225.t1  Dpon_P_019772888.1  ...   Ojir_g1909.t1  Pstr_g200926.t1   Smad_g6322.t1  Sory_P_030758243.1     Tcas_P_967054.1
13557  OG0013557  Agra_P_050316372.1  Cass_AG9765979.1  Dfro_g12508.t1  Dpon_P_019758814.2  ...   Ojir_g9692.t1   Pstr_g37198.t1    Smad_g465.t1  Sory_P_030763414.1     Tcas_P_971352.1
13558  OG0013558  Agra_P_050316407.1  Cass_AG9762302.1  Dfro_g12493.t1  Dpon_P_019758828.1  ...   Ojir_g9703.t1   Pstr_g98775.t1   Smad_g9572.t1  Sory_P_030763403.1  Tcas_P_008196032.1

[466 rows x 11 columns]
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ ls

続いてconcatinate.shを作成し実行した。

kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ nano concatinate.sh
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ sh concatinate.sh 
start at
Tue Nov  5 10:26:19 JST 2024
Tue Nov  5 10:26:21 JST 2024
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ ls
IQTREE_1.py  all_seq.fa  concatinate.sh
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$

続いてIQTREE_2.pyを作り、実行した。実行時のコマンドは以下の通り。

python IQTREE_2.py all_seq.fa ../../for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data/OG_list.txt 

続いてMPT環境に入る。

source ~/tools/pyenv_env/ManualPhilo_profile 

続いて~/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data/に移動、OG_list.txtがあることを確認。

(MPT) kosukesano@at138:~/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data$ ls OG_list.txt 
OG_list.txt
(MPT) kosukesano@at138:~/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data$

このディレクトリでalign.shを作成、実行する。

続いて~/tools/for_IQTREE/241105_10spに移動し、IQTREE_3.pyを作成・実行した。

これにより*.maftted.trimed.edit.faファイルができたので、続いてmakerun.pyを実行した。

これによりrun.nexができたので、これを使ってmanualphylo.shqsubで投げて完了!

ついでにmakealltree.shも投げてASTRALの前準備をしておく。

PANTHERの構築

牧野先生に紹介いただいたサイトからPANTHER19.0_hmmscoring.tgzをダウンロード、遺伝研に~/tools/for_pantherを作成しコピーした。 これを以下のコマンドで解凍

tar -tzvf PANTHER19.0_hmmscoring.tgz

マダラ含む6種の昆虫ゲノムを用いたCAFE、マダラで有意に減少した遺伝子

down_Bc=read.csv("/Users/kosukesano/bio/for_cafe/241007_cafe_original_data/useIQTREE/Base_change.tab", sep="\t")
down_lines = readLines("/Users/kosukesano/bio/for_cafe/241007_cafe_original_data/useIQTREE/Base_asr.tre")

# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", down_lines))
trees_end <- which(grepl("END;", down_lines))
down_trees_lines <- lines[(trees_start + 1):(trees_end - 1)]


# 不要なスペースを削除
down_trees_lines <- gsub("^\\s+|\\s+$", "", down_trees_lines)

# データフレームに変換
library(tibble)
down_trees_df = tibble(Tree= down_trees_lines)


down_ex=down_trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
  #lines|> 
  tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
  dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
  dplyr::mutate(Dpon = stringr::str_detect(tree, pattern="Dpon<0>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
  dplyr::mutate(Cass= stringr::str_detect(tree, pattern="Cass<1>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
  dplyr::mutate(Tcas = stringr::str_detect(tree, pattern="<2>\\*_")) |>
  dplyr::mutate(Sory = stringr::str_detect(tree, pattern="<3>\\*_")) |>
  dplyr::mutate(Smad = stringr::str_detect(tree, pattern="<4>\\*_")) |>
  dplyr::mutate(Cass_Dpon = stringr::str_detect(tree, pattern="<5>\\*_")) |>
  dplyr::mutate(Sory_Tcas = stringr::str_detect(tree, pattern="<6>\\*_")) |>
  dplyr::mutate(Cass_Smad = stringr::str_detect(tree, pattern="<7>\\*_")) |>
  dplyr::mutate(Agra = stringr::str_detect(tree, pattern="<8>\\*_")) |>
  dplyr::mutate(all = stringr::str_detect(tree, pattern="<9>\\*_")) |>
  dplyr::select(!c(tree)) 

down_Smad_ex = down_ex|>
  dplyr::filter(Smad == "TRUE")|>
    dplyr::filter(Cass_Smad == "FALSE")|>
  dplyr::filter(Cass == "FALSE")|>
  dplyr::filter(Cass_Dpon == "FALSE")|>
  dplyr::filter(Dpon == "FALSE")|>
    dplyr::filter(Agra == "FALSE")|>
  dplyr::filter(Sory_Tcas == "FALSE")|>
  dplyr::filter(Sory == "FALSE")|>
  dplyr::filter(Tcas == "FALSE")|>
  dplyr::filter(all == "FALSE") 

down_Smad_bc=down_Bc |>###マダラで増加した0Gの0G番号を抽出したファイル
  dplyr::select("FamilyID","Smad.4.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
  dplyr::mutate(Smad.4.= stringr::str_extract(Smad.4., r"(^-\d+)")) |>#マダラの遺伝子量増減の列のうち、-がついているものを抽出
  tidyr::drop_na()|>
  dplyr::filter(Smad.4. != 0) 

down_Smad_df=dplyr::inner_join(down_Smad_bc, down_Smad_ex, by = c(FamilyID = "OG_num"))

# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
  read.delim("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv", header=FALSE, sep="\t", 
             #stringsAsFactors=FALSE,
             #col.names = "Data"
             skip=1
  )|>
  dplyr::select("V1", "V5")

down_Smad_df2=dplyr::left_join(down_Smad_df, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
  dplyr::select(!c(Smad.4.)) |>
  print()
   FamilyID  Dpon  Cass  Tcas  Sory Smad Cass_Dpon Sory_Tcas Cass_Smad  Agra
1 OG0000148 FALSE FALSE FALSE FALSE TRUE     FALSE     FALSE     FALSE FALSE
2 OG0000508 FALSE FALSE FALSE FALSE TRUE     FALSE     FALSE     FALSE FALSE
3 OG0000510 FALSE FALSE FALSE FALSE TRUE     FALSE     FALSE     FALSE FALSE
4 OG0001470 FALSE FALSE FALSE FALSE TRUE     FALSE     FALSE     FALSE FALSE
5 OG0002251 FALSE FALSE FALSE FALSE TRUE     FALSE     FALSE     FALSE FALSE
    all                           V5
1 FALSE                Smad_g9618.t1
2 FALSE   Smad_g861.t1, Smad_g861.t2
3 FALSE Smad_g5018.t1, Smad_g5018.t2
4 FALSE                Smad_g8879.t1
5 FALSE               Smad_g12072.t1

1107

オジロのBUSCO続き

忘れてた

~/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_OjiroOjiro_busco.sh.e27077009を見てみる

### Ojiro_busco.sh.e27077009の中身

ERROR:  Please do not provide a full path in --out parameter, no slash. Use out_path in the config.ini file to specify the full path.
ERROR:  BUSCO analysis failed !
ERROR:  Check the logs, read the user guide (https://busco.ezlab.org/busco_userguide.html), and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issues

どうも出力ファイルは絶対・相対パスともに受け付けてもらえず、ファイル名だけをポンと指定しなければいけないらしい。

Ojiro_busco.shを書き換え、qsubで投げた。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date


singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m protein\
        -i /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta\
        -o BUSCO_OUTPUT_Ojiro\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

date

結果

# BUSCO version is: 5.1.3 
# The lineage dataset is:  (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta
# BUSCO was run in mode: proteins

        ***** Results: *****

        C:98.8%[S:80.4%,D:18.4%],F:0.2%,M:1.0%,n:1013      
        1000    Complete BUSCOs (C)                        
        814     Complete and single-copy BUSCOs (S)        
        186     Complete and duplicated BUSCOs (D)         
        2       Fragmented BUSCOs (F)                      
        11      Missing BUSCOs (M)                         
        1013    Total BUSCO groups searched                

Dependencies and versions:
        hmmsearch: 3.1

めっちゃ高いじゃん

PANTHER続き

この前の重たいファイルの解凍が終わった。

続いて、他のファイルをwgetで得る。

wget http://data.pantherdb.org/ftp/hmm_scoring/current_release/pantherScore2.2/lib/

続いてhmmerをダウンロードして解凍。ダウンロード元はこちら

~/tools/for_panther/hmmer-3.1b2に入り、以下のコマンドを実行してビルド

.configure
make

そうするとこうなる

kosukesano@at139:~/tools/for_panther/hmmer-3.1b2$ ls
COPYRIGHT  LICENSE   Makefile.in  Userguide.pdf  config.guess  config.status  configure     documentation  install-sh     profmark       src        tutorial
INSTALL    Makefile  README       aclocal.m4     config.log    config.sub     configure.ac  easel          libdivsufsort  release-notes  testsuite
kosukesano@at139:~/tools/for_panther/hmmer-3.1b2$ 

EDTA_profilehmmerのパスを書いた。

PATH=$PATH:/home/kosukesano/tools/for_panther/hmmer-3.1b2/src

実際にPANTHERを動かしてみる

(EDTA2) kosukesano@at138:~/tools/for_panther/pantherScore2.2$ perl pantherScore2.2.pl -l ../target/famlib/rel/PANTHER19.0_altVersion/hmmscoring/PANTHER19.0/ -D B -V -i  ../test.fa -o ../output.
txt -n
pantherScore2.2.pl starts at Thu Nov  7 19:49:08 2024
__________________________________________________
Verbose level is high.
Input fasta file is: ../test.fa
Display Type: B
library: ../target/famlib/rel/PANTHER19.0_altVersion/hmmscoring/PANTHER19.0/
Output file is: ../output.txt
__________________________________________________
pantherScore2.2.pl ends at Thu Nov  7 19:50:21 2024

結果はこう。

### ~/tools/for_panther/output.txtの中身

SM1_g915.t1     PTHR46564:SF1   TRANSPOSASE     1.1e-17 66.9    88-320
SM1_g915.t1     PTHR46564       TRANSPOSASE     1.1e-17 66.9    88-320
SM1_g915.t2     PTHR46564       TRANSPOSASE     1.1e-17 66.9    109-341
SM1_g915.t2     PTHR46564:SF1   TRANSPOSASE     1.1e-17 66.9    109-341
g9808.t1        PTHR23022:SF119 TC1-LIKE TRANSPOSASE DDE DOMAIN-CONTAINING PROTEIN      9.4e-16 61.4    62-327

できてそうじゃね?

1108

PANTHER続き

~/tools/for_panther/pantherScore2.2pantherScore2.2.plを一部書き換えた。

書き換え前

# necessary libraries
use lib 'lib';
use FamLibBuilder;
use FastaFile;
use strict;
use FileHandle;

# necessary libraries
#use lib 'lib';
use lib '/home/kosukesano/tools/for_panther/pantherScore2.2/lib';
use FamLibBuilder;
use FastaFile;
use strict;
use FileHandle;

こうしたのちに、~/tools/for_panther/working_dir/241108_Madaraディレクトリでpanther_Madara.shを作成、実行した。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

source /home/kosukesano/tools/pyenv_env/EDTA_profile

perl /home/kosukesano/tools/for_panther/pantherScore2.2/pantherScore2.2.pl\
        -l /home/kosukesano/tools/for_panther/target/famlib/rel/PANTHER19.0_altVersion/hmmscoring/PANTHER19.0/\
        -D B\
        -V\
        -i /home/kosukesano/tools/for_panther/working_dir/nama_data/Smad.fasta\
        -o output.txt\
        -n

date

MCOを使ったETEのテスト

~/tools/for_ETE/test_241108を作成、そこにglucose dehydrogenase遺伝子をコードする遺伝子ファミリーであるOG0000769の系統樹をコピーする。系統樹はヘッダー変更後のOrthofinder出力のものを使った

(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ cp ~/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Gene_Trees/OG0000769_tree.txt ../test_241108
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ ls
OG0000769_tree.txt
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ 

続いてconcatinate.shRenamehedder_6sp.fastaファイルを全て結合したファイルを用意する。

続いて OG0000769の遺伝子IDの.txtファイルを作る。

OG0000769: Agra_P_050303580.1 Agra_P_050303581.1 Cass_AG9768182.1 Dpon_P_019767992.2 Dpon_P_048517880.1 Dpon_P_048518052.1 Smad_g2479.t1 Smad_g3654.t1 Smad_g3654.t2 Smad_g3655.t1 Smad_g3656.t1 Smad_g4097.t1 Smad_g4916.t1 Smad_g4916.t2 Sory_P_030758496.1 Sory_P_030758497.1 Sory_P_030758499.1 Sory_P_030758501.1 Tcas_P_968478.1

これはOrthofinder出力のOrthogroups.txtから引っ張ってきた。

これを使ってIQTREE_2.pyを実行。

(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ python IQTREE_2.py all_seq.fa OG0000769.txt 
>Agra_P_050303580.1
MSFHACGCETTWVNPSIADTCSGNQYVVFMTLVDMLIRYACKISDPCGRIIPKTQPAAQYDFIVIGAGSGGSTIAGRLAEVNEWNTLLLEAGMDEPPATQIPAVPAFTNTLIDWNFTTQQESGACLSSNGICSWPRGKVLGGSSVFNGMMYMRGTPADYQRWVDAGNTEWSYDDLLPVFKASEGNRQVGSLVDEKYHGTKGPFTIQQFNSHPKLAEDILIAANQSGWPVSNDLNGDQFVGFAIAQTNNRDGARLSLAKAFVRPHKNNDNFDVMINSTVTKILIEGDGDNKRAYGVEFVYNGTTYTVNATKEVILAAGAVQTPQILLLSGIGPKEELDAVNIEQVHNLTGVGKGIKNHVSFSIVGTINETDVVDLNDESLAQYLSKGTGPLSGTGMSQLTARIPSNYTSPDDPDIQLFFSGMSNTCAYSGLPGLPTDPEDPSALRVLSIACVNLHPKSSGQISLLSNNPLDPPKIVANYFNHSDDIKVVLAGVRIAQKLMQSKIMQEKYNFTLQQYDYGNCSSLYEFDTDDFWECAIRYDTYPENHQSASCKIAPQSNEEACVNQRLQVYGISNLRITDASVIYTPTSGNIQAIIVAIAERASQFIREDYGIDSQI

.
.
.
.
.
.
.

>Tcas_P_968478.1
MSCCANEPYIGPPLDRTCFGGSYIVFMHLLNTLITQQCDVSEICQRINPQLQPDSEYDFVVIGGGAGGSVVAGRLSENPNWKILLIEAGGDEPPGSQVPSMMNNYLGDSQMDWRYRTEPQEMACLGRPGRRCDWPRGRVLGGSGVIHGMMYMRGLPSDYNEWEARGNEGWGYKDVEEYFKKSEGNRDIGDGVEGRYHSSDGPMLVQRFPDQPQIAEDVLRAGAELGYPVVGDLNGEQHWGFTIAQANIKNGSRLSSARAFLRPARNRPNLHVMINSTATKILINSNDTAKTISAVEFTYNNQSFTVKVRREAIVSAGAINTPHLLLLSGIGPREELDKVGIEQVHNLPGVGQNLKNHVSFAVNFQLTKIENYNDLNWNTVREYLTERRGPMSSTGVTQVAARISSKYANPDGKNPDLQFFFSGFLAHCSLSGGVKEPEDPTNPTAAKSFTIRPTFLRPRSRGFIGLNSRDPKEPPLMQPNYLTDEEDVKRMVAGIRIAQNLANTTILTTKYGIQMVNTDYGDCSRNYTFDSDEFWACALRYDTGPENHQSCSCKMGPASDPSAVVDPKLQVHGIEGLRIMDASVMPTVLSGNTHATVVMIAEKGSDYIKQKWSDK

(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ ls
IQTREE_2.py  OG0000769.txt  OG0000769:  OG0000769_tree.txt  all_seq.fa  concatinate.sh
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ less OG0000769:
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ mv OG0000769: OG0000769.fasta
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ ls
IQTREE_2.py  OG0000769.fasta  OG0000769.txt  OG0000769_tree.txt  all_seq.fa  concatinate.sh
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ 

出力ファイルがOG0000769:とかいう変な名前だったので変更もしておいた。

この後にMPT環境に入り、align.shを実行。この時、OG0000769:のままでもよかった

1109

重信先生のマダラゲノムを使用したpanther

(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ ls
Release_240921-SmiMad_GenePrediction_GM1.zip
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ unzip Release_240921-SmiMad_GenePrediction_GM1.zip 
Archive:  Release_240921-SmiMad_GenePrediction_GM1.zip
warning:  stripped absolute path spec from /
mapname:  conversion of  failed
 extracting: README.md               
 extracting: .Rhistory               
 extracting: braker.SmiMad_GM1.gff   
 extracting: braker.SmiMad_GM1.gff.t2g.txt  
 extracting: braker.SmiMad_GM1.gff.aa.fasta  
 extracting: braker.SmiMad_GM1.gff.cds.fasta  
 extracting: braker.SmiMad_GM1.gff.longest.tsv  
 extracting: braker.SmiMad_GM1.gff.nr.aa.fasta  
 extracting: braker.SmiMad_GM1.gff.nr.cds.fasta  
 extracting: braker.SmiMad_GM1.gff.transcript.fasta  
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ ls
README.md                                     braker.SmiMad_GM1.gff           braker.SmiMad_GM1.gff.cds.fasta    braker.SmiMad_GM1.gff.nr.aa.fasta   braker.SmiMad_GM1.gff.t2g.txt
Release_240921-SmiMad_GenePrediction_GM1.zip  braker.SmiMad_GM1.gff.aa.fasta  braker.SmiMad_GM1.gff.longest.tsv  braker.SmiMad_GM1.gff.nr.cds.fasta  braker.SmiMad_GM1.gff.transcript.fasta
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ ls braker.SmiMad_GM1.gff.cds.fasta 
braker.SmiMad_GM1.gff.cds.fasta
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ less braker.SmiMad_GM1.gff.cds.fasta 
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ less README.md 
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ cp braker.SmiMad_GM1.gff.cds.fasta ~/tools/for_panther/working_dir/nama_data/
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ cd ~/tools/for_panther/working_dir/
(MPT) kosukesano@at138:~/tools/for_panther/working_dir$ ls
241108_Madara  241108_test  nama_data
(MPT) kosukesano@at138:~/tools/for_panther/working_dir$ mkdir 241109_Sigenobu_Madara
(MPT) kosukesano@at138:~/tools/for_panther/working_dir$ cd 241109_Sigenobu_Madara/
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ cp ../241108_Madara/panther_Madara.sh panther_Sigenobu.sh
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ nano panther_Sigenobu.sh 
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ qsub panther_Sigenobu.sh 
Your job 27239810 ("panther_Sigenobu.sh") has been submitted
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ qstat
job-ID     prior   name       user         state submit/start at     queue                          jclass                         slots ja-task-ID 
------------------------------------------------------------------------------------------------------------------------------------------------
  27235021 0.25410 panther_Ma kosukesano   r     11/08/2024 17:10:29 gpu.q@igt011                                                      6        
  27234958 0.25331 QLOGIN     kosukesano   r     11/08/2024 16:32:10 login.q@at138                                                     1        
  27239810 0.00000 panther_Si kosukesano   qw    11/09/2024 10:53:28                                                                  16        
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ 

1111

PANTHERの結果

自分のマダラの方のPANTHERが終わってたので見てみる

### ~/tools/for_panther/working_dir/241108_Madara/output.txtの中身の一部

      1 Smad_g9309.t1   PTHR23226:SF416 FI01424P        2.8e-43 151.2   132-521
      2 Smad_g10140.t29 PTHR23110:SF111 LONGITUDINALS LACKING PROTEIN, ISOFORMS F_I_K_T 1.6e-103        350.5   1-403
      3 Smad_g6717.t2   PTHR47958:SF73  RNA HELICASE    7.1e-265        884.7   63-514
      4 Smad_g13408.t1  PTHR24394       ZINC FINGER PROTEIN     5.6e-131        441.5   49-482
      5 Smad_g1852.t1   PTHR10009:SF7   GH10609P-RELATED        2.3e-156        523.7   6-419
      6 Smad_g11009.t1  PTHR45703:SF32  DYNEINS HEAVY CHAIN     0       6045.5  1-3926
      7 Smad_g6907.t3   PTHR16154:SF6   SPINOPHILIN, ISOFORM J  0       1148.1  1-1768
      8 Smad_g7989.t1   PTHR14710       GEM-ASSOCIATED PROTEIN 6        4e-33   117.8   6-159
      9 Smad_g7989.t1   PTHR14710:SF2   GEM-ASSOCIATED PROTEIN 6        4e-33   117.8   6-159
     10 Smad_g6622.t2   PTHR46763:SF1   DYNEIN REGULATORY COMPLEX PROTEIN 8     4.5e-65 221.7   8-163
     .
     .
     .
     .
     .
       16548 Smad_g5292.t1   PTHR24403       ZINC FINGER PROTEIN     1.6e-128        433.4   93-1274
  16549 Smad_g10689.t1  PTHR12748       ORIGIN RECOGNITION COMPLEX SUBUNIT 3    6.4e-156        524.0   30-697
  16550 Smad_g10665.t1  PTHR12081:SF43  TRANSCRIPTION FACTOR E2F1       3.2e-70 240.5   66-468
  16551 Smad_g9538.t1   PTHR47027       REVERSE TRANSCRIPTASE DOMAIN-CONTAINING PROTEIN 1.3e-12 50.4    6-88
  16552 Smad_g7813.t1   PTHR47611:SF3   HAT C-TERMINAL DIMERISATION DOMAIN-CONTAINING PROTEIN   2.8e-32 115.8   39-234
  16553 Smad_g12219.t1  PTHR45877:SF2   E3 UBIQUITIN-PROTEIN LIGASE SINA-RELATED        1.5e-76 261.0   13-507
  16554 Smad_g5017.t1   PTHR17605:SF0   RIBOSOME BIOGENESIS PROTEIN BOP1        3.3e-285        951.1   103-833
  16555 Smad_g5017.t1   PTHR17605       RIBOSOME BIOGENESIS PROTEIN BOP1  BLOCK OF PROLIFERATION 1 PROTEIN      3.3e-285        951.1   103-833
  16556 Smad_g11653.t1  PTHR43157       PHOSPHATIDYLINOSITOL-GLYCAN BIOSYNTHESIS CLASS F PROTEIN-RELATED        1.4e-88 299.9   9-305
  16557 Smad_g10978.t1  PTHR48021:SF46  MAJOR FACILITATOR SUPERFAMILY (MFS) PROFILE DOMAIN-CONTAINING PROTEIN   2.4e-158        531.1   17-465
  16558 Smad_g8495.t1   PTHR21411:SF0   REGULATORY PROTEIN ZESTE        2.6e-14 56.4    3-144
  16559 Smad_g8495.t1   PTHR21411       APONTIC 2.6e-14 56.4    3-144
  16560 Smad_g10140.t17 PTHR23110:SF111 LONGITUDINALS LACKING PROTEIN, ISOFORMS F_I_K_T 3.1e-93 316.6   1-332
  16561 Smad_g10839.t1  PTHR22999:SF40  PX DOMAIN-CONTAINING PROTEIN KINASE-LIKE PROTEIN        6.4e-205        685.0   1-567

アノテーションがしっかりついてる!

でも3〜4日かかったんだよなあ….。

これをローカルに転送。

output=read.csv("/Users/kosukesano/bio/for_panther/nama_data/241111_Madara_output/output.txt", sep="\t")|>
  tidyr::separate(PANTHER_ID, into = c("PANTHER_ID", "family_ID"), sep = ":")

output|>
  dplyr::count(gene_ID) |>
  tail(n = 5)
            gene_ID n
14784 Smad_g9995.t1 1
14785 Smad_g9996.t1 2
14786 Smad_g9997.t1 1
14787 Smad_g9998.t1 1
14788 Smad_g9999.t1 1
### 14788遺伝子

アウトプットファイルには1遺伝子につき複数のアノテーションがついていた。重複を消すためパターンをカウントすると14788遺伝子あった。

これについて、マッキーさんのトランスポゾンデータセットに該当するものを除去する

tp=read.csv("/Users/kosukesano/bio/for_panther/nama_data/transpsons_data/PANTHER11.0_transpsons.txt", sep="\t")

tp_family=read.csv("/Users/kosukesano/bio/for_panther/nama_data/transpsons_data/PANTHER11.0_transpsons_subfamily.txt", sep="\t")

tp_ID=dplyr::full_join(tp, tp_family, by = "PANTHER_ID")|>
  tidyr::separate(PANTHER_ID, into = c("PANTHER_ID", "family_ID"), sep = ":")|>
  dplyr::mutate(transpozon = stringr::str_replace_all(PANTHER_ID, "^.*.*$", "TRUE"))

tp_madara1 =dplyr::full_join(tp_ID, output, by = "PANTHER_ID")|>
  tidyr::replace_na(list(transpozon="FALSE"))|>
  dplyr::filter(stringr::str_detect(transpozon, "TRUE")==FALSE)|>
  dplyr::count(gene_ID) |>
  tail(n = 5)

tp_madara1
            gene_ID n
14476 Smad_g9995.t1 1
14477 Smad_g9996.t1 2
14478 Smad_g9997.t1 1
14479 Smad_g9998.t1 1
14480 Smad_g9999.t1 1

重複を消すためパターンをカウントすると14480遺伝子あった。

思ったより少なかったので、アノテーション情報を参照にトランスポゾンを除去してみる。

filter= output|>
  dplyr::mutate(transpozon = stringr::str_replace_all(gene_ID, "^.*.*$", "TRUE"))|>
  dplyr::filter(stringr::str_detect(gene_function, "^.*TRANSPOSASE.*$")==TRUE | 
                  stringr::str_detect(gene_function, "^.*TRANSPOSABLE.*$")==TRUE|
                  stringr::str_detect(gene_function, "^.*TRANSCRIPTASE.*$")==TRUE|
                  stringr::str_detect(gene_function, "^.*TRANSPOSON.*$")==TRUE)|>
  dplyr::select(c(gene_ID, PANTHER_ID, transpozon)) 

tp_madara2=dplyr::full_join(filter, output, by = "gene_ID", relationship = "many-to-many")|>
  tidyr::replace_na(list(transpozon="FALSE"))|>
  dplyr::filter(stringr::str_detect(transpozon, "TRUE")==FALSE)|>
  dplyr::count(gene_ID) |>
  tail(n = 5)

tp_madara2
            gene_ID n
14301 Smad_g9995.t1 1
14302 Smad_g9996.t1 2
14303 Smad_g9997.t1 1
14304 Smad_g9998.t1 1
14305 Smad_g9999.t1 1

重複を消すためパターンをカウントすると14305遺伝子あった。

じゃあこの2つの重複部分ってどうなんですか?ということで

filter1=dplyr::full_join(tp_ID, output, by = "PANTHER_ID")|>
  tidyr::replace_na(list(transpozon="FALSE"))|>
  dplyr::filter(stringr::str_detect(transpozon, "TRUE")==TRUE)|>

  dplyr::count(gene_ID) 

filter2= output|>
  dplyr::mutate(transpozon = stringr::str_replace_all(gene_ID, "^.*.*$", "TRUE"))|>
  dplyr::filter(stringr::str_detect(gene_function, "^.*TRANSPOSASE.*$")==TRUE | 
                  stringr::str_detect(gene_function, "^.*TRANSPOSABLE.*$")==TRUE|
                  stringr::str_detect(gene_function, "^.*TRANSCRIPTASE.*$")==TRUE|
                  stringr::str_detect(gene_function, "^.*TRANSPOSON.*$")==TRUE)|>
  dplyr::select(c(gene_ID, transpozon)) |>
  dplyr::count(gene_ID) 

filter3=tp_madara=dplyr::full_join(filter1, filter2, by = "gene_ID", relationship = "many-to-many")|>
  dplyr::select(c(gene_ID)) |>
  dplyr::mutate(transpozon = stringr::str_replace_all(gene_ID, "^.*.*$", "TRUE"))

tp_madara3=dplyr::full_join(filter3, output, by = "gene_ID", relationship = "many-to-many")|>
  tidyr::replace_na(list(transpozon="FALSE"))|>
  dplyr::filter(stringr::str_detect(transpozon, "TRUE")==FALSE)|>
  dplyr::count(gene_ID) |>
  tail(n = 5)

tp_madara3
            gene_ID n
14141 Smad_g9996.t1 2
14142 Smad_g9997.t1 1
14143 Smad_g9998.t1 1
14144 Smad_g9999.t1 1
14145          <NA> 1

重複を消すためパターンをカウントすると14144遺伝子あった。

結構少なめ……。マッキーさんのデータはPANTHER11.0だったけど、自分のはPANTHER19.0なんだよな。これが理由か?

自分のPANTHER19.0のデータベースを見たけど、どれがどれだかわからなかった。マッキーさんに抽出方法を聞かなきゃ。

ちなみにマダラの遺伝子数はこう。

(MPT) kosukesano@at138:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa 
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,570  8,790,187        5    530.5   20,186
(MPT) kosukesano@at138:~/tools/for_braker/Madara/braker$

1113

MCOを用いたPAMLやり直し

これまでのall_seq.faはアミノ酸配列を参照していた。CDS版のall_seq.faを新たに取得する。

### concatinate.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_paml/data/241009_RemakeHedder_6sp_afterchange"  ## Please replace with the actual directory containing the fasta files

# Define the output directory and output file
new="/home/kosukesano/tools/for_ETE/test_241108"
mkdir -p $new

# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
    cat "$file" >> "${new}/CDS_all_seq.fa"
done

echo end at
date

これで取ったCDS_all_seq.faをもとにOG0000769の配列を取り出し、アライメントする。

続いて~/tools/for_ETE/test_241108/bsA以下でbsA.ctlを実行する

### bsA.ctlの中身

     seqfile = /home/kosukesano/tools/for_ETE/test_241108/OG0000769.maffted.trimed.fa
     treefile = /home/kosukesano/tools/for_ETE/test_241108/OG0000769.nwk
      outfile = /home/kosukesano/tools/for_ETE/test_241108/bsA/result/OG0000769_branch_alt

        noisy = 9   * 0,1,2,3,9: how much rubbish on the screen
      verbose = 1   * 1: detailed output, 0: concise output
      runmode = 0   * 0: user tree;  1: semi-automatic;  2: automatic
                    * 3: StepwiseAddition; (4,5):PerturbationNNI

      seqtype = 1   * 1:codons; 2:AAs; 3:codons-->AAs
    CodonFreq = 2   * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table
        clock = 0   * 0: no clock, unrooted tree, 1: clock, rooted tree
        model = 2
                    * models for codons:
                        * 0:one, 1:b, 2:2 or more dN/dS ratios for branches

      NSsites = 2   * dN/dS among sites. 0:no variation, 1:neutral, 2:positive
        icode = 0   * 0:standard genetic code; 1:mammalian mt; 2-10:see below

    fix_kappa = 0   * 1: kappa fixed, 0: kappa to be estimated
        kappa = 2   * initial or fixed kappa
    fix_omega = 0   * 1: omega or omega_1 fixed, 0: estimate
        omega = 1   * initial or fixed omega, for codons or codon-transltd AAs

    fix_alpha = 1   * 0: estimate gamma shape parameter; 1: fix it at alpha
        alpha = .0  * initial or fixed alpha, 0:infinity (constant rate)
       Malpha = 0   * different alphas for genes
        ncatG = 4   * # of categories in the dG or AdG models of rates

        getSE = 0   * 0: don't want them, 1: want S.E.s of estimates
 RateAncestor = 0   * (1/0): rates (alpha>0) or ancestral states (alpha=0)
       method = 0   * 0: simultaneous; 1: one branch at a time
  fix_blength = 0  * 0: ignore, -1: random, 1: initial, 2: fixed, 3: proportional
(MPT) kosukesano@at138:~/tools/for_ETE/test_241108$ nano lrp.py
(MPT) kosukesano@at138:~/tools/for_ETE/test_241108$ python lrp.py 
0.19340151084909837
(MPT) kosukesano@at138:~/tools/for_ETE/test_241108$

有意な差はなかった。

ローカルでのPAML構築

公式Githubサイトを参考に行った。

~/tools/for_pamlで以下を実行

git clone https://github.com/abacus-gene/paml.git
cd paml
cd src
make -f Makefile
rm *.o
mkdir ../bin
mv baseml basemlg chi2 codeml evolver infinitesites mcmctree pamp yn00 ../bin

~/tools/pyenv_env/ETE_profileに以下を追記

export PATH=/home/kosukesano/tools/for_paml/paml/bin:$PATH

結局エラーは別理由っぽかったけど、備忘録として残しておく。

10種でのCAFE

:~/bio/for_cafe$ mkdir 241113_10sp_Orthofinder_data
:~/bio/for_cafe$ cd 241113_10sp_Orthofinder_data/
:~/bio/for_cafe/241113_10sp_Orthofinder_data$
:~/bio/for_cafe/241113_10sp_Orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/Orthogroups/Orthogroups.GeneCount.tsv
 /Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.GeneCount.tsv                                                                                                                                      100% 1254KB  10.5MB/s   00:00    
:~/bio/for_cafe/241113_10sp_Orthofinder_data$

ここで以下の通りにRを実行

### 1113

Orthologs_raw <- read_tsv(paste("/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/Orthogroups.GeneCount.tsv", sep = "/"))


##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
  select(!c(Orthogroup, Total)) %>%
  t()

##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>% 
  apply(2, max) %>%
  as.data.frame() %>%
  rename(max_real = ".")
saisyou <- Enzan %>% 
  apply(2, min) %>%
  as.data.frame() %>%
  rename(min_real = ".")

##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
  bind_cols(saidai, saisyou)

##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>% 
  mutate(sa = max_real - min_real) %>%
  filter(max_real != min_real) %>%
  filter(sa < 50)

##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>% 
  mutate(Description = Orthogroup, ID = Orthogroup) %>%
  relocate(Description, ID) %>%
  select(!c(Orthogroup, max_real, min_real, sa))

#Orthologs_3 %>% 
#  write_tsv(paste("/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)

tree = read.tree("/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/run.nex.treefile")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
  tree,
  100000,
  age.min = 152.3,  # 推定分岐年代の最小値(MYA)
  age.max = 236.2,  # 推定分岐年代の最大値(MYA)
  node = mrca,   # getMRCAで指定したノード
  S = 1,
  tol = 1e-20,
  CV = FALSE,
  eval.max = 500,
  iter.max = 500
)
is.ultrametric(tree2)  # ultrametricかどうか確認
[1] TRUE
#write.tree(tree2, file = "/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/tree_ultrametric.nwk")  # ultrametric系統樹の保存

遺伝研に転送

:~/bio/for_cafe/241113_10sp_Orthofinder_data$ scp /Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/Orthogroups.GeneCount2.tsv tree_ultrametric.nwk kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241113_10sp
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
Orthogroups.GeneCount2.tsv                                                                                                                                     100% 1467KB  17.5MB/s   00:00    
tree_ultrametric.nwk                                                                                                                                           100%  276    28.7KB/s   00:00    
:~/bio/for_cafe/241113_10sp_Orthofinder_data$ 

CAFE5を実行

kosukesano@at138:~/tools/for_cafe/241113_10sp$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk

Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk 

Filtering families not present at the root from: 37427 to 9286

No root family size distribution specified, using uniform distribution

Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1

Starting Search for Initial Parameter Values
Lambda: 0.0015657989564863
Score (-lnL): 195030.33196575
Lambda: 0.0015657989564863
Score (-lnL): 195030.33196575
Lambda: 0.0016440889043106
.
.
.
.
.
.
Lambda: 0.0016435728328772
Score (-lnL): 194986.33229339

Completed 21 iterations
Time: 0H 0M 4S
Best match is: 0.001643591946634
Final -lnL: 194986.33229273

40 values were attempted (0% rejected)

Inferring processes for Base model
Score (-lnL): 194986.33229273
Maximum possible lambda for this topology: 0.0032212076114385
Computing pvalues...
done!

Starting reconstruction processes for Base model
Done!

kosukesano@at138:~/tools/for_cafe/241113_10sp$ 

1115

配列長が最も長いIsoformの抽出、そのためのモジュール導入

高川くんが作ってくれたfaspというモジュールで最長のアイソフォームだけ取り出せるようなので、これを導入

kosukesano@at138:~/tools$ mkdir for_isoform_ex
kosukesano@at138:~/tools$ cd for_isoform_ex/
kosukesano@at138:~/tools/for_isoform_ex$ ls
kosukesano@at138:~/tools/for_isoform_ex$ 

tool下にfor_isoform_exディレクトリを作成。

kosukesano@at138:~/tools/for_isoform_ex$ python3 -m venv fasp
kosukesano@at138:~/tools/for_isoform_ex$ ls
fasp
kosukesano@at138:~/tools/for_isoform_ex$ source ~/tools/for_isoform_ex/fasp/bin/activate
(fasp) kosukesano@at138:~/tools/for_isoform_ex$

fasp用のvenv環境を作成、それを立ち上げる。その時の立ち上げコマンドはsource ~/tools/for_isoform_ex/fasp/bin/activate

(fasp) kosukesano@at138:~/tools/for_isoform_ex$ pip3 install git+https://github.com/tamasakian/fasp.git
Collecting git+https://github.com/tamasakian/fasp.git
  Cloning https://github.com/tamasakian/fasp.git to /tmp/pip-req-build-kkooafn8
  Running command git clone --filter=blob:none --quiet https://github.com/tamasakian/fasp.git /tmp/pip-req-build-kkooafn8
  Resolved https://github.com/tamasakian/fasp.git to commit 64f590e29f3b8bbd8432bf851187ffea29d0a235
  Installing build dependencies ... done
  Getting requirements to build wheel ... done
  Preparing metadata (pyproject.toml) ... done
Collecting biopython
  Using cached biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting numpy
  Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
Building wheels for collected packages: fasp
  Building wheel for fasp (pyproject.toml) ... done
  Created wheel for fasp: filename=fasp-0.0.1-py3-none-any.whl size=7776 sha256=35195170b52f87867e0cd5059a4e12b655fddd90be32b9ec107812ef812f07af
  Stored in directory: /tmp/pip-ephem-wheel-cache-11owkspy/wheels/24/bb/a8/5c34a8384cbe3415028754571c2d0015f2486e23528fce65d2
Successfully built fasp
Installing collected packages: numpy, biopython, fasp
Successfully installed biopython-1.84 fasp-0.0.1 numpy-2.1.3
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ type python3
python3 is /lustre7/home/kosukesano/tools/for_isoform_ex/fasp/bin/python3
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ 

実行のコマンドはこんな感じ

(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Tcas.faa output_data/Tcas_iso1.faa nama_data/Tcas.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Tcas_iso1.faa 

faspのインストール

BRAKER出力ファイルに対する最長Isoformの抽出

/home/kosukesano/tools/for_isoform_extest.pyを作成、実行した。

#!/usr/bin/env python3

"""Library for processing protein FASTA files.

Functions
---------
exclude_isoforms_by_length: Exclude isoforms based on length.
exclude_non_nuclear_proteins: Exclude mitochondrial and chloroplast proteins.

"""

from Bio import SeqIO

def exclude_isoforms_by_length(input_filename: str, output_filename: str, gff3_file: str) -> None:
    """Exclude isoforms based on length.
    
    Args
    ----
    input_filename : str
        Input protein FASTA filename.
    output_filename : str
        Output protein FASTA filename.
    gff3_file : str
        Input genome GFF3 filename.

    """
    
    def parse_gff3(gff3_file: str) -> dict:
        """Parse GFF3 file and make dict with 'protein_id', 'start', 'end' and 'length' of each gene. 

        Args
        ----
        gff3_file : str

        Returns
        -------
        genes : dict
            Dict with 'protein_id', 'start', 'end' and 'length' of each genes. 

        """
        genes = {}
        with open(gff3_file, "r") as gff3_handle:
            for line in gff3_handle:
                ## Exclude comments
                if line.startswith("#"):
                    continue

                li = line.strip().split("\t")
                if len(li) != 9:
                    continue
                ptg_num, tools, kind, start, end, score, strand, phase, attributes = li

                ## Exclude lines other than CDS.
                if kind != "CDS":
                    continue

                ## Handle attributes.
                attr_dict = {}
                for attr in attributes.split("; "):
                    key_value = attr.split(" ")
                    ##print(len(key_value))
                    if len(key_value) != 2:
                        continue
                    key, value = key_value
                    attr_dict[key] = value
                
                ## Exclude CDS without protein_id.
                if "transcript_id" not in attr_dict:
                    print("trans")
                    continue

                ## Exclude CDS without protein_id.
                if "gene_id" not in attr_dict:
                    print("gene_ID")
                    continue

                ## Read information of CDS.
                protein_id = attr_dict["transcript_id"]
                protein_id = protein_id.strip('"')
                print(protein_id)
                if "gene_id" in attr_dict:
                    gene = attr_dict["gene_id"]
                start, end, length = int(start), int(end), int(end) - int(start)
                if gene not in genes:
                    genes[gene] = []
                genes[gene].append({"protein_id": protein_id, "start": start, "end": end, "length": length})

        return genes

    def select_longest_protein(genes: dict) -> dict:
        """Select the longest proteins for each gene based on CDS information.

        Args
        ----
        genes : dict

        Returns
        -------
        longest_proteins : dict

        """
        longest_proteins = {}
        for gene, cds_list in genes.items():
            protein_lengths = {}
            for cds in cds_list:
                protein_id = cds["protein_id"]
                length = cds["length"]
                if protein_id not in protein_lengths:
                    protein_lengths[protein_id] = length
                else:
                    protein_lengths[protein_id] += length
            ## Select the longest protein.
            longest_proteins[gene] = max(protein_lengths, key=protein_lengths.get)

        return longest_proteins

    def slice_proteins(input_filename: str, output_filename: str, longest_proteins: dict) -> None:
        """Slice FASTA file to retain only the longest proteins for each gene.

        Args
        ----
        input_filename : str
        output_filename : str
        longest_proteins : dict

        """
        input_proteins = SeqIO.to_dict(SeqIO.parse(input_filename, "fasta"))
        selected_protein_ids = set(longest_proteins.values())

        output_proteins = []
        for selected_protein_id in selected_protein_ids:
            if selected_protein_id not in input_proteins:
                continue
            output_proteins.append(input_proteins[selected_protein_id])
        
        with open(output_filename, "w") as output_handle:
            SeqIO.write(output_proteins, output_handle, "fasta")

    genes = parse_gff3(gff3_file)
    longest_proteins = select_longest_protein(genes)
    slice_proteins(input_filename, output_filename, longest_proteins)


exclude_isoforms_by_length("nama_data/Madara.aa", "output_data/Madara_iso1.aa", "nama_data/Madara.gtf")

これを使う前にちゃんとfasp環境を起動しておくこと。

全部やるとこんな感じ

(fasp) kosukesano@at138:~/tools/for_isoform_ex$ ls nama_data/
Agra.faa  Agra.gff  Cass.faa  Cass.gff  Dpon.faa  Dpon.gff  Madara.aa  Madara.gtf  Sory.faa  Sory.gff  Tcas.faa  Tcas.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Agra.faa output_data/Agra_iso1.faa nama_data/Agra.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Agra_iso1.faa 
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Cass.faa output_data/Cass_iso1.faa nama_data/Cass.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Cass_iso1.faa 
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Dpon.faa output_data/Dpon_iso1.faa nama_data/Dpon.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Dpon_iso1.faa 
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Sory.faa output_data/Sory_iso1.faa nama_data/Sory.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Sory_iso1.faa 
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/
output_data/ is a directory
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ ls output_data/
Agra_iso1.faa  Cass_iso1.faa  Dpon_iso1.faa  Madara_iso1.aa  Sory_iso1.faa  Tcas_iso1.faa
(fasp) kosukesano@at138:~/tools/for_isoform_ex$

アイソフォームを抜いた状態でのOrthoFinder

~/tools/for_orthofinder/241115_6sp_isoを作成、その下でedit.pyを作成し実行した。

### edit.pyの中身

import os
from Bio import SeqIO

# 入力ディレクトリと出力ディレクトリのパス
input_dir = '/home/kosukesano/tools/for_isoform_ex/output_data/'
output_dir = '../241115_6sp_iso/'

# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 入力ディレクトリ内のすべての .faa または .aa ファイルを処理
for input_file in os.listdir(input_dir):
    if input_file.endswith(('.faa', '.aa')):
        input_path = os.path.join(input_dir, input_file)
        output_path = os.path.join(output_dir, input_file)

        # ファイル形式を設定
        format_type = 'fasta'  # Biopython では .faa も .aa も "fasta" 形式として扱う

        with open(output_path, 'w') as outfile:
            for record in SeqIO.parse(input_path, format_type):
                header = record.description
                seq = str(record.seq)
                new_header = ""  # 初期化

                # ヘッダーが「g」で始まる場合
                if header.startswith("g"):
                    number = header.split()[0]  # ヘッダーの最初の番号部分を取得
                    new_header = f">Smad_{number}"

                # ヘッダーが「]」で終わる場合
                elif header.endswith("]"):
                    within_brackets = header.split('[')[-1].split(']')[0]
                    first_letter = within_brackets[0]  # 最初の1文字
                    space_after = within_brackets.split()[-1][:3]  # スペース後の3文字
                    first_part = header.split()[0][1:]
                    new_header = f">{first_letter}{space_after}_{first_part}"

                # それ以外
                else:
                    new_header = f">{header.split()[0]}"

                # 新しいヘッダーと配列を出力ファイルに書き込む
                outfile.write(f"{new_header}\n{seq}\n")

        print(f"{output_path} に保存しました。")

前のedit.pyに改良を加えてるよ。

(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ python edit.py 
../241115_6sp_iso/Cass_iso1.faa に保存しました。
../241115_6sp_iso/Sory_iso1.faa に保存しました。
../241115_6sp_iso/Dpon_iso1.faa に保存しました。
../241115_6sp_iso/Agra_iso1.faa に保存しました。
../241115_6sp_iso/Madara_iso1.aa に保存しました。
../241115_6sp_iso/Tcas_iso1.faa に保存しました。
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa  Cass_iso1.faa  Dpon_iso1.faa  Madara_iso1.aa  Sory_iso1.faa  Tcas_iso1.faa  edit.py

エラーが怖かったので、一応拡張子を揃えておいた。

(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa  Cass_iso1.faa  Dpon_iso1.faa  Madara_iso1.aa  Sory_iso1.faa  Tcas_iso1.faa  edit.py
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ mv Madara_iso1.aa Smad_iso1.faa
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa  Cass_iso1.faa  Dpon_iso1.faa  Smad_iso1.faa  Sory_iso1.faa  Tcas_iso1.faa  edit.py
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ 

ここで、orthofinder_241115.shを作成しqsubで投げた。

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l intel
echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
        -f /home/kosukesano/tools/for_orthofinder/241115_6sp_iso\
        -t 16

date

1118

isoformを抜いた状態でのOrthofinder結果

Orthofinderはちゃんと動作して、結果も出力されてた。

(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa  Dpon_iso1.faa  Smad_iso1.faa  Tcas_iso1.faa  orthofinder_241115.sh            orthofinder_241115.sh.o27262834   orthofinder_241115.sh.po27262834
Cass_iso1.faa  OrthoFinder    Sory_iso1.faa  edit.py        orthofinder_241115.sh.e27262834  orthofinder_241115.sh.pe27262834
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls OrthoFinder/Results_Nov15/
Citation.txt                     Gene_Trees            Orthogroups                            Phylogenetically_Misplaced_Genes  Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics  Log.txt               Orthologues                            Putative_Xenologs                 Species_Tree
Gene_Duplication_Events          Orthogroup_Sequences  Phylogenetic_Hierarchical_Orthogroups  Resolved_Gene_Trees               WorkingDirectory

Single Copy Orthologの数を見てみる

### Orthogroups_SingleCopyOrthologues.txtの末尾

   5100 OG0008578
   5101 OG0008579
   5102 OG0008580
   5103 OG0008581
   5104 OG0008582
   5105 OG0008583
   5106 OG0008584
   5107 OG0008585
   5108 OG0008586

5108個のSCOが取れた。増えた。

Dfamデータベースを使用したRepeatMasker

Dfamからトランスポゾンのデータを取得する。

その中でもColeopteraのみでフィルタリングしたデータを.fastaで取得、遺伝研のソフトマスク用生データディレクトリに転送した。 ``bash :~/Downloads$ scp /Users/kosukesano/Downloads/dfam-fasta-download.fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data/Dfam_coleoptera.fasta Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4 +–[ED25519 256]–+ | ..o.o…* o+ | | . . ..= + o* o| | . = oB +.| | +.oo .+E+o.| | .*S. o.o.+| | .o. . . .+| | .. + . o| | . ..oo . | | . .=. | +—-[SHA256]—–+ dfam-fasta-download.fasta 100% 6397KB 42.5MB/s 00:00
:~/Downloads$

その中身はこんな感じ
```bash
kosukesano@at139:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat Dfam_coleoptera.fasta 
file                   format  type  num_seqs    sum_len  min_len  avg_len  max_len
Dfam_coleoptera.fasta  FASTA   DNA      3,358  6,329,269       98  1,884.8   20,756
kosukesano@at139:~/tools/for_softmask/nama_data$

~/tools/for_softmask/241118_Madara_softmaskディレクトリを作成、以下でconc.pyスクリプトを実行した。

### conc.pyの中身

import os

# 入力ファイルのパス
file1 = '/home/kosukesano/tools/for_softmask/nama_data/Dfam_coleoptera.fasta'
file2 = '/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta'

# 出力ディレクトリと出力ファイル名
output_dir = '/home/kosukesano/tools/for_softmask/241118_Madara_softmask'
output_file = os.path.join(output_dir, 'Madara_db.fasta')

# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# ファイルを結合して出力
with open(output_file, 'w') as outfile:
    for input_file in [file1, file2]:
        with open(input_file, 'r') as infile:
            outfile.write(infile.read())

print(f"結合されたファイルが {output_file} に保存されました。")

Dfamのデータベースとマダラのゲノムを結合するスクリプト。出力は/home/kosukesano/tools/for_softmask/241118_Madara_softmask/Madara_db.fasta

上記の出力を使ってソフトマスクを行う。EDTA環境を立ち上げた状態で以下を行う。

BLASTデータベースの作成

(EDTA2) kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask$ BuildDatabase -name Madara_Dfam_DB Madara_db.fasta 
Building database Madara_Dfam_DB:
  Reading Madara_db.fasta...
Number of sequences (bp) added to database: 3567 ( 1301722634 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask$ ls
Madara_Dfam_DB.nhr  Madara_Dfam_DB.nin  Madara_Dfam_DB.njs  Madara_Dfam_DB.nnd  Madara_Dfam_DB.nni  Madara_Dfam_DB.nog  Madara_Dfam_DB.nsq  Madara_Dfam_DB.translation  Madara_db.fasta  conc.py
(EDTA2) kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask$ 

RepeatModelerの実行

### Madara_ReoeatModeler.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Madara_Dfam_DB -pa 6
date

これをqsubで投げた。

1119

重信先生のデータを用いたPANTHER

前に入れたやつが途中で止まってた。

ヤケクソでmedium24slot指定でブン投げてみる。

1120

Dfamデータベースを使用したRepeatMasker続き

RepeatModelerが終わっていたので、RepeatMaskerに移る。

以下のスクリプトを投げた。


#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

source /home/kosukesano/tools/pyenv_env/EDTA_profile

RepeatMasker -pa 10\
        -xsmall\
        -lib /home/kosukesano/tools/for_softmask/241118_Madara_softmask/RM_2232007.MonNov181355052024/consensi.fa.classified\
        /home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta\
        -dir /home/kosukesano/tools/for_softmask/241118_Madara_softmask/output_dir


date

-dirコマンドで出力のディレクトリを指定している。

ちなみにRepeatModeler出力のconsensi.fa.classifledを見てみる

kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask/RM_2232007.MonNov181355052024$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat consensi.fa.classified 
file                    format  type  num_seqs    sum_len  min_len  avg_len  max_len
consensi.fa.classified  FASTA   DNA      5,536  3,541,598       30    639.7   16,988
kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask/RM_2232007.MonNov181355052024$ 

前のマダラゲノムのみで行なったやつはこれ。

kosukesano@at138:~/tools/for_softmask/RM_16988.WedMay221052072024$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat consensi.fa.classified 
file                    format  type  num_seqs    sum_len  min_len  avg_len  max_len
consensi.fa.classified  FASTA   DNA      5,527  3,531,164       30    638.9   11,934
kosukesano@at138:~/tools/for_softmask/RM_16988.WedMay221052072024$

あんまり変わってなくね?

このデータのマスキングが終わったので、BRAKERにかける

(EDTA2) kosukesano@at138:~/tools/for_softmask$ cp 241118_Madara_softmask/output_dir/231117_madaragenome.fasta.masked ~/tools/for_braker/nama_data/241120_madara_dfam.fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask$ cd ~/tools/for_braker
(EDTA2) kosukesano@at138:~/tools/for_braker$ ls
241013_for_debag_madara  Dfro  Dval  Ekam  Femo  Femo_pilon  Kohuki  Kohuki_thread_one  Madara  Ojiro  OnlyProtein_femo  OnlyProtein_madara  Pstr  Sigenobu_Madara  nama_data
(EDTA2) kosukesano@at138:~/tools/for_braker$ mkdir 241120_madara_dfam
(EDTA2) kosukesano@at138:~/tools/for_braker$ cd 241120_madara_dfam/
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ cp ../241013_for_debag_madara/madara_braker.sh ../241120_madara_dfam/
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ ls
madara_braker.sh
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ less madara_braker.sh 
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ nano madara_braker.sh 
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ qsub madara_braker.sh 
Your job 27278006 ("madara_braker.sh") has been submitted
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ 

DfamRepeatMasker用のデータベースも併せて統合したものを使ってマスキング

kosukesano@at138:~/tools/for_softmask$ mkdir 241120_Madara_softmask

DfamのダウンロードサイトからDfam-RepeatMasker.lib.gzをダウンロードし遺伝研で解凍、それを用いてデータベースを作成し、RepeatModlerをかけた。

Dockerのインストール

Dockerをホームページからインストールし、以下を実行した。

:~/bio/for_RepeatMasker_Docker$ docker login -u k05uke54n0
Password: 
Login Succeeded
:~/bio/for_RepeatMasker_Docker$ docker pull dfam/tetools
Using default tag: latest
latest: Pulling from dfam/tetools
7600b3ee981a: Download complete 
7605118baa98: Download complete 
ffa0e8276bc9: Download complete 
d72ae4f33534: Download complete 
0616a07cf248: Download complete 
4f4fb700ef54: Download complete 
742e13a892ac: Download complete 
3d2705dcb843: Download complete 
dc8ad28c3cd1: Download complete 
54ae706075d5: Download complete 
44939d338867: Download complete 
47bf88a48c47: Download complete 
67af5d4f89bd: Download complete 
81791adf7c7a: Download complete 
8cd46d290033: Download complete 
bce2500fb467: Download complete 
12a7888856bd: Download complete 
429d893a0445: Download complete 
e59051f42299: Download complete 
cf2ab9e656d9: Download complete 
9cae1165f82b: Download complete 
ce6de47b44b5: Download complete 
513ea75e10b2: Download complete 
06658812daff: Download complete 
Digest: sha256:f60775010b4dfee18a92aea9191f66cd727d9764c5ba6142e03d3f7719604c28
Status: Downloaded newer image for dfam/tetools:latest
docker.io/dfam/tetools:latest
:~/bio/for_RepeatMasker_Docker$ docker container run -dit --mount type=bind,source="$PWD",target=/work --workdir /work --user "$(id -u):$(id -g)" --name dfamtet dfam/tetools
73f896bc2cd927361517e63d4fdc5242fe8b9287184e9326c4877881bd7aef94
:~/bio/for_RepeatMasker_Docker$ 

1125

Dfamデータベース(Beetleのみ)を使用したBRAKER

結果はちゃんと出力されていた。

kosukesano@at139:~/tools/for_braker/241120_madara_dfam/braker$ ls
Augustus  GeneMark-ETP  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  species  what-to-cite.txt
kosukesano@at139:~/tools/for_braker/241120_madara_dfam/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa 
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,972  8,851,473        4    521.5   20,186
kosukesano@at139:~/tools/for_braker/241120_madara_dfam/braker$ 

ちなみにこれまでのやつはこんな感じ

### マダラゲノム(RNA_seqデータ含)
kosukesano@at137:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,570  8,790,187        5    530.5   20,186
######################################################################

ちょっと遺伝子数が増えてるけど、劇的に変わっているわけではない?

Dfamデータベース(RepeatMasker用のデータベースも含む)を使用したRepeatModeler

kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ ls RM_1157354.ThuNov211558022024/
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ cp ../241118_Madara_softmask/Madara_RepeatMasker.sh
cp: missing destination file operand after '../241118_Madara_softmask/Madara_RepeatMasker.sh'
Try 'cp --help' for more information.
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ cp ../241118_Madara_softmask/Madara_RepeatMasker.sh ../241120_Madara_softmask/
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ ls
Madara_Dfam_DB-families.fa   Madara_Dfam_DB.nin  Madara_Dfam_DB.nni  Madara_Dfam_DB.translation  Madara_RepeatModeler.sh.e27277193   Madara_RepeatModeler.sh.po27277193  conc.py
Madara_Dfam_DB-families.stk  Madara_Dfam_DB.njs  Madara_Dfam_DB.nog  Madara_RepeatMasker.sh      Madara_RepeatModeler.sh.o27277193   Madara_db.fasta
Madara_Dfam_DB.nhr           Madara_Dfam_DB.nnd  Madara_Dfam_DB.nsq  Madara_RepeatModeler.sh     Madara_RepeatModeler.sh.pe27277193  RM_1157354.ThuNov211558022024
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ nano Madara_RepeatMasker.sh 
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ mkdir output_dir
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ nano Madara_RepeatMasker.sh 
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ qsub Madara_RepeatMasker.sh 
Your job 27290166 ("Madara_RepeatMasker.sh") has been submitted

qsubで投げた。

遺伝研環境でのRepeatMaskerの導入、およびDfamのデータベースを用いたマスキング

apptainerを用いて導入する。

kosukesano@at139:~/tools/for_RepeatMasker_Docker$ apptainer pull dfam-tetools_1.sif docker://dfam/tetools:1
INFO:    Converting OCI blobs to SIF format
INFO:    Starting build...
Getting image source signatures
Copying blob 742e13a892ac done   | 
Copying blob 7600b3ee981a done   | 
Copying blob 0616a07cf248 done   | 
Copying blob 8cd46d290033 done   | 
Copying blob 4f4fb700ef54 done   | 
Copying blob d72ae4f33534 done   | 
Copying blob dc8ad28c3cd1 done   | 
Copying blob 54ae706075d5 done   | 
Copying blob ce6de47b44b5 done   | 
Copying blob 47bf88a48c47 done   | 
Copying blob 9cae1165f82b done   | 
Copying blob bce2500fb467 done   | 
Copying blob 429d893a0445 done   | 
Copying blob 7605118baa98 done   | 
Copying blob ffa0e8276bc9 done   | 
Copying blob 3d2705dcb843 done   | 
Copying blob 44939d338867 done   | 
Copying blob 81791adf7c7a done   | 
Copying blob e59051f42299 done   | 
Copying blob cf2ab9e656d9 done   | 
Copying blob 06658812daff done   | 
Copying blob 12a7888856bd done   | 
Copying blob 513ea75e10b2 done   | 
Copying blob 67af5d4f89bd done   | 
Copying config da3bae2c6b done   | 
Writing manifest to image destination
2024/11/25 13:48:40  info unpack layer: sha256:8cd46d290033f265db57fd808ac81c444ec5a5b3f189c3d6d85043b647336913
2024/11/25 13:48:41  info unpack layer: sha256:7600b3ee981a7da30c6181a64e7a862ab42a7ef4eb5f4021770655123d90eaf4
2024/11/25 13:48:48  info unpack layer: sha256:d72ae4f33534d9e04d250010043e874741574cf74aaef61f4bbedcf4b27b6b5d
2024/11/25 13:48:52  info unpack layer: sha256:0616a07cf2481c7ce28b962bc2108052c0784c65602dd014ddb34bff4badf806
2024/11/25 13:48:52  info unpack layer: sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc38e8acc1
2024/11/25 13:48:52  info unpack layer: sha256:742e13a892ac834c62e9c0bd5d91e9798c932c99033ce82244afda4cf4f04314
2024/11/25 13:48:52  info unpack layer: sha256:dc8ad28c3cd175baccbdefca7b8e2c58c8a85a19c3686eb6af8d678dd0d32f8f
2024/11/25 13:48:57  info unpack layer: sha256:54ae706075d59d2da183faf12c8c4997eccadfa0fcfce4eee200fff3e72444b7
2024/11/25 13:48:57  info unpack layer: sha256:ce6de47b44b5618172427bf581e854e6f8a1b36c4e0b2408f1b5c1ce87440137
2024/11/25 13:48:57  info unpack layer: sha256:47bf88a48c4750f30f6cabf07d00bc730b16bab2f0211abac3861563a9a6bcb3
2024/11/25 13:48:57  info unpack layer: sha256:9cae1165f82bff3d4a918672c4ad0f4773cf51ece9de7d0201f2cde10ba2de85
2024/11/25 13:48:57  info unpack layer: sha256:bce2500fb46702c545e7f2a3c3f644085f353c81c866f49c5f8636cfb8bd365e
2024/11/25 13:48:58  info unpack layer: sha256:429d893a0445adeb93e1794c3a455f92e01cf4eecdb981059df9a83cf383d6e7
2024/11/25 13:48:59  info unpack layer: sha256:7605118baa98c83f22c50890d171c55949e2889fca04bcde16de350fe1aa38d3
2024/11/25 13:48:59  info unpack layer: sha256:ffa0e8276bc93f8cea95e70370fd76f73c5f0f2347b2b750556609aefc801d6e
2024/11/25 13:48:59  info unpack layer: sha256:3d2705dcb843a526a186434a876bc5fcdd8ff8d181877229968e364a987c62e1
2024/11/25 13:48:59  info unpack layer: sha256:44939d3388676d5eb2edbc758e3718b52941e38a38d75bde3ff27f2019ede7a7
2024/11/25 13:49:00  info unpack layer: sha256:81791adf7c7ad09fbd8b1eed958889ec4a957d08ee6ca97ed13df548d2671860
2024/11/25 13:49:00  info unpack layer: sha256:e59051f42299289bf4f74dbe3e22b715f1af42cc76baeece0ac7b3e25e6000f4
2024/11/25 13:49:00  info unpack layer: sha256:cf2ab9e656d9dbf726af2e2567e9291da2c96a453fbdf384558eb1b8a53407fa
2024/11/25 13:49:02  info unpack layer: sha256:06658812daff63fbc76ca6723fe76d8185a5fb4ccfed12fc44492bd57c43269f
2024/11/25 13:49:02  info unpack layer: sha256:12a7888856bdb1803b968e5bce63f3ae122cbffef3607eff3ff70498a67928b9
2024/11/25 13:49:02  info unpack layer: sha256:513ea75e10b2549874648c14dc87ae2113d3864707934096030f2e5805a23591
2024/11/25 13:49:02  info unpack layer: sha256:67af5d4f89bde4288ccddd26f2490c30ac03f41352c972bab703c6168fc0f064
INFO:    Creating SIF file...
kosukesano@at139:~/tools/for_RepeatMasker_Docker$ ls
dfam-tetools_1.sif
kosukesano@at139:~/tools/for_RepeatMasker_Docker$ 

これを使ったRepeatMaskerの実行。以下のスクリプトをqsubで投げた。

### madara_softmask.shの中身


#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
        RepeatMasker\
        -pa 6\
        -s\
        -lib /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/Dfam-RepeatMasker.lib\
        -dir /home/kosukesano/tools/for_RepeatMasker_Docker/241125_madara/output_dir\
        -xsmall\
        -gff\
        /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta


echo end at
date

1126

scorpionでのRepeatMasker環境の構築とマダラソフトマスクの実行

dendezia@scorpion:~/tool/for_RepeatMasker_Docker$ apptainer pull dfam-tetools_1.sif docker://dfam/tetools:1
INFO:    Converting OCI blobs to SIF format
INFO:    Starting build...
Copying blob 742e13a892ac done   | 
Copying blob 0616a07cf248 done   | 
Copying blob 4f4fb700ef54 done   | 
Copying blob d72ae4f33534 done   | 
Copying blob 8cd46d290033 done   | 
Copying blob 7600b3ee981a done   | 
Copying blob dc8ad28c3cd1 done   | 
Copying blob 54ae706075d5 done   | 
Copying blob ce6de47b44b5 done   | 
Copying blob 47bf88a48c47 done   | 
Copying blob 9cae1165f82b done   | 
Copying blob bce2500fb467 done   | 
Copying blob 429d893a0445 done   | 
Copying blob 7605118baa98 done   | 
Copying blob ffa0e8276bc9 done   | 
Copying blob 3d2705dcb843 done   | 
Copying blob 44939d338867 done   | 
Copying blob 81791adf7c7a done   | 
Copying blob e59051f42299 done   | 
Copying blob cf2ab9e656d9 done   | 
Copying blob 06658812daff done   | 
Copying blob 12a7888856bd done   | 
Copying blob 513ea75e10b2 done   | 
Copying blob 67af5d4f89bd done   | 
Copying config da3bae2c6b done   | 
Writing manifest to image destination
2024/11/26 10:43:03  info unpack layer: sha256:8cd46d290033f265db57fd808ac81c444ec5a5b3f189c3d6d85043b647336913
2024/11/26 10:43:06  info unpack layer: sha256:7600b3ee981a7da30c6181a64e7a862ab42a7ef4eb5f4021770655123d90eaf4
2024/11/26 10:43:20  info unpack layer: sha256:d72ae4f33534d9e04d250010043e874741574cf74aaef61f4bbedcf4b27b6b5d
2024/11/26 10:43:26  info unpack layer: sha256:0616a07cf2481c7ce28b962bc2108052c0784c65602dd014ddb34bff4badf806
2024/11/26 10:43:26  info unpack layer: sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc38e8acc1
2024/11/26 10:43:26  info unpack layer: sha256:742e13a892ac834c62e9c0bd5d91e9798c932c99033ce82244afda4cf4f04314
2024/11/26 10:43:26  info unpack layer: sha256:dc8ad28c3cd175baccbdefca7b8e2c58c8a85a19c3686eb6af8d678dd0d32f8f
2024/11/26 10:43:33  info unpack layer: sha256:54ae706075d59d2da183faf12c8c4997eccadfa0fcfce4eee200fff3e72444b7
2024/11/26 10:43:34  info unpack layer: sha256:ce6de47b44b5618172427bf581e854e6f8a1b36c4e0b2408f1b5c1ce87440137
2024/11/26 10:43:34  info unpack layer: sha256:47bf88a48c4750f30f6cabf07d00bc730b16bab2f0211abac3861563a9a6bcb3
2024/11/26 10:43:34  info unpack layer: sha256:9cae1165f82bff3d4a918672c4ad0f4773cf51ece9de7d0201f2cde10ba2de85
2024/11/26 10:43:34  info unpack layer: sha256:bce2500fb46702c545e7f2a3c3f644085f353c81c866f49c5f8636cfb8bd365e
2024/11/26 10:43:34  info unpack layer: sha256:429d893a0445adeb93e1794c3a455f92e01cf4eecdb981059df9a83cf383d6e7
2024/11/26 10:43:37  info unpack layer: sha256:7605118baa98c83f22c50890d171c55949e2889fca04bcde16de350fe1aa38d3
2024/11/26 10:43:38  info unpack layer: sha256:ffa0e8276bc93f8cea95e70370fd76f73c5f0f2347b2b750556609aefc801d6e
2024/11/26 10:43:38  info unpack layer: sha256:3d2705dcb843a526a186434a876bc5fcdd8ff8d181877229968e364a987c62e1
2024/11/26 10:43:39  info unpack layer: sha256:44939d3388676d5eb2edbc758e3718b52941e38a38d75bde3ff27f2019ede7a7
2024/11/26 10:43:39  info unpack layer: sha256:81791adf7c7ad09fbd8b1eed958889ec4a957d08ee6ca97ed13df548d2671860
2024/11/26 10:43:39  info unpack layer: sha256:e59051f42299289bf4f74dbe3e22b715f1af42cc76baeece0ac7b3e25e6000f4
2024/11/26 10:43:39  info unpack layer: sha256:cf2ab9e656d9dbf726af2e2567e9291da2c96a453fbdf384558eb1b8a53407fa
2024/11/26 10:43:42  info unpack layer: sha256:06658812daff63fbc76ca6723fe76d8185a5fb4ccfed12fc44492bd57c43269f
2024/11/26 10:43:42  info unpack layer: sha256:12a7888856bdb1803b968e5bce63f3ae122cbffef3607eff3ff70498a67928b9
2024/11/26 10:43:42  info unpack layer: sha256:513ea75e10b2549874648c14dc87ae2113d3864707934096030f2e5805a23591
2024/11/26 10:43:42  info unpack layer: sha256:67af5d4f89bde4288ccddd26f2490c30ac03f41352c972bab703c6168fc0f064
INFO:    Creating SIF file...
dendezia@scorpion:~~/tool/for_RepeatMasker_Docker$ ls
dfam-tetools_1.sif
dendezia@scorpion:~/tool/for_RepeatMasker_Docker$ 

ここで/241126_madara/madara_RepeatMasker.shを作成、実行した。

### madara_RepeatMasker.shの中身


#$ -S /bin/bash
#$ -cwd

echo start at
date

apptainer exec /home/dendezia/tool/for_RepeatMasker_Docker/dfam-tetools_1.sif\
        RepeatMasker\
        -pa 6\
        -s\
        -lib /home/dendezia/tool/for_RepeatMasker_Docker/nama_data/Dfam-RepeatMasker.lib\
        -dir /home/dendezia/tool/for_RepeatMasker_Docker/241126_madara/output_dir\
        -xsmall\
        -gff\
        /home/dendezia/tool/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta


echo end at
date

EDTARepeatMaskerでも同じことってできるのかな? #### scorpionでのEDTAを使ったRepeatMaskerの実行


#$ -S /bin/bash
#$ -cwd

echo start at
date


cd /home/dendezia/tool/for_softmask/241126_madara_EDTA/

source /home/dendezia/tool/pyenv_env/EDTA_profile

RepeatMasker\
        -pa 6\
        -s\
        -lib /home/dendezia/tool/for_RepeatMasker_Docker/nama_data/Dfam-RepeatMasker.lib\
        -dir /home/dendezia/tool/for_softmask/241126_madara_EDTA/output_dir\
        -xsmall\
        -gff\
        /home/dendezia/tool/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta


echo end at
date

これをqsubで投げた。o

1127

Docker使用のRepeatMasker産物などを使ったBRAKER

現在、

  • DfamRepeatMasker用データをEDTABuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
  • DfamRepeatMasker用データを-libでそのまま指定し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
  • DfamRepeatMasker用データを-libでそのまま指定し、DockerRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ

の3パターンがある。

それぞれ~/tools/for_braker/nama_data

  • 241127_madara_dfam_RMdata_buildDB.fasta
  • 241127_madara_dfam_RM_data_NotUsedBuildDB.fasta
  • 241127_madara_DockerRM.fasta としてマスキングデータを保存。これを使ってBRAKERをかける。

~/tools/for_braker/241127_madaraを作成、その下でそれぞれのデータごとにディレクトリを分けて解析を行う。

kosukesano@at139:~/tools/for_braker/241127_madara$ ls
DockerRM  dfam_RM_data_NotUsedBuildDB  dfam_RMdata_buildDB
kosukesano@at139:~/tools/for_braker/241127_madara$

各ディレクトリでmadara_braker.shを作成、qsubで投げた。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16

echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241127_madara_DockerRM.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2,\
        body-1_1,body-1_2,body-2_1,body-2_2,body-3_1,body-3_2,\
        large-larva-1_1,large-larva-1_2,large-larva-2_1,large-larva-2_2,large-larva-3_1,large-larva-3_2,\
        middle-larva-1_1,middle-larva-1_2,middle-larva-2_1,middle-larva-2_2,middle-larva-3_1,middle-larva-3_2,\
        ovary-1_1,ovary-1_2,ovary-2_1,ovary-2_2,ovary-3_1,ovary-3_2 \
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Madara_RNAseq\
        --threads=16\
        --species=Smadaranus_241127_DockerRM\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

メモ・RepeatMaskerのバージョンの確認

dockerで入れたRepeatMaskerのバージョンを見てみた。

RepeatMasker version 4.1.7-p1
Unknown option: version
/opt/RepeatMasker/RepeatMasker - 4.1.7-p1

1128

DfamライブラリとRepeatModeler出力のconsensi.fa.classifiedを結合させたファイルを-libに指定したRepeatMasker

RepeatModeler単体だとライブラリを参照してマスキングをかけるだけ、そこに入力するライブラリの種類が様々ある。RepeatModelerはマスクしたいゲノムからde novoでライブラリを作ってくれるらしい。またDfamには様々な生物のトランスポゾンをまとめたライブラリが存在する。

これ2つのライブラリを結合して、それをインプットにすればいいのでは?

マダラケシツブゾウムシのゲノムを元に作ったライブラリ、~/tools/for_softmask/RM_16988.WedMay221052072024/consensi.fa.classifiedDfamDfam-RepeatMasker.libを結合する。結合したファイルは241128_for_madara.libとして出力。

kosukesano@at138:~/tools/for_RepeatMasker_Docker/nama_data$ cat Dfam-RepeatMasker.lib ~/tools/for_softmask/RM_16988.WedMay221052072024/consensi.fa.classified > 241128_for_madara.lib
kosukesano@at138:~/tools/for_RepeatMasker_Docker/nama_data$ ls
231117_madaragenome.fasta  241128_for_madara.lib  Dfam-RepeatMasker.lib
kosukesano@at138:~/tools/for_RepeatMasker_Docker/nama_data$

これを元にしてRepeatMaskerをかける。~/tools/for_RepeatMasker_Docker/241128_madaraを作成し、その下でmadara_softmask.shを実行。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
        RepeatMasker\
        -pa 6\
        -s\
        -lib /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/241128_for_madara.lib\
        -dir /home/kosukesano/tools/for_RepeatMasker_Docker/241128_madara/output_dir\
        -xsmall\
        -gff\
        /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta


echo end at
date

実行に時間がかかりそうだったから、scorpionにも241128_for_madara.libを送って同じことをする。

1129

DfamライブラリとRepeatModeler出力のconsensi.fa.classifiedを結合させたファイルを-libに指定したRepeatMasker結果・それを用いたBRAKER

kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ ls
madara_softmask.sh  madara_softmask.sh.e27301361  madara_softmask.sh.o27301361  madara_softmask.sh.pe27301361  madara_softmask.sh.po27301361  output_dir
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ ls output_dir/
231117_madaragenome.fasta.cat.gz  231117_madaragenome.fasta.masked  231117_madaragenome.fasta.out  231117_madaragenome.fasta.out.gff  231117_madaragenome.fasta.tbl
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$

ちゃんと出力されてた。

これを241129_madara_dfamplusbuilddb.fastaとしてコピー。

kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ cp output_dir/231117_madaragenome.fasta.masked ~/tools/for_braker/nama_data/241129_madara_dfamplusbuilddb.fasta
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ 

これを使ってBRAKERを実行。

kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ cp output_dir/231117_madaragenome.fasta.masked ~/tools/for_braker/nama_data/241129_madara_dfamplusbuilddb.fasta
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ 

~/tools/for_braker/241129_madaraディレクトリを作成、その下でmadara_braker.shを書き実行した。

### madara_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16

echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241129_madara_dfamplusbuilddb.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2,\
        body-1_1,body-1_2,body-2_1,body-2_2,body-3_1,body-3_2,\
        large-larva-1_1,large-larva-1_2,large-larva-2_1,large-larva-2_2,large-larva-3_1,large-larva-3_2,\
        middle-larva-1_1,middle-larva-1_2,middle-larva-2_1,middle-larva-2_2,middle-larva-3_1,middle-larva-3_2,\
        ovary-1_1,ovary-1_2,ovary-2_1,ovary-2_2,ovary-3_1,ovary-3_2 \
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Madara_RNAseq\
        --threads=16\
        --species=Smadaranus_241129_DockerRM\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

2024年12月

1202

各マスキングの比較

  • マダラのゲノムデータをEDTABuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ(元のやつ)
kosukesano@at137:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,570  8,790,187        5    530.5   20,186
######################################################################
  • 重信先生アノテーションのデータ
kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.SmiMad_GM1.gff.aa.fasta 
file                            format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.SmiMad_GM1.gff.aa.fasta  FASTA   Protein    18,048  9,405,353        2    521.1   20,594
kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$
######################################################################
  • Dfamの甲虫トランスポゾンデータをマダラのゲノムデータと結合し、EDTABuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241120_madara_dfam/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    16,972  8,851,473        4    521.5   20,186
kosukesano@at138:~/tools/for_braker/241120_madara_dfam/braker$
######################################################################
  • DfamRepeatMasker用データをEDTABuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RMdata_buildDB/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    17,600  9,052,417        5    514.3   20,186
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RMdata_buildDB/braker$
######################################################################
  • DfamRepeatMasker用データを-libでそのまま指定し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RM_data_NotUsedBuildDB/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    18,546  9,073,620        5    489.2   18,391
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RM_data_NotUsedBuildDB/braker$ 
#######################################################################
  • DfamRepeatMasker用データを-libでそのまま指定し、DockerRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241127_madara/DockerRM/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    18,341  8,915,822        5    486.1    8,823
kosukesano@at138:~/tools/for_braker/241127_madara/DockerRM/braker$ 
#######################################################################
  • DfamRepeatMasker用データとマダラのゲノムデータをEDTABuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241129_madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    15,150  8,400,426        5    554.5   20,186
kosukesano@at138:~/tools/for_braker/241129_madara/braker$ 
#######################################################################

ちなみにEDTA内のRepeatMaskerのバージョンはこう。

(EDTA2) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ RepeatMasker
RepeatMasker version 4.1.2-p1
No query sequence file indicated

/lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/envs/EDTA2/bin/RepeatMasker - 4.1.2-p1
NAME
    RepeatMasker - Mask repetitive DNA

DfamRepeatMasker用データとマダラのゲノムデータをEDTABuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ」からアイソフォームを抜く

source ~/tools/for_isoform_ex/fasp/bin/activatefasp環境を立ち上げておく。

その後、~/tools/for_braker/241129_madara以下でExIsoform.pyを作成、実行。

### ExIsoform.pyの中身


#!/usr/bin/env python3

"""Library for processing protein FASTA files.

Functions
---------
exclude_isoforms_by_length: Exclude isoforms based on length.
exclude_non_nuclear_proteins: Exclude mitochondrial and chloroplast proteins.

"""

from Bio import SeqIO

def exclude_isoforms_by_length(input_filename: str, output_filename: str, gff3_file: str) -> None:
    """Exclude isoforms based on length.
    
    Args
    ----
    input_filename : str
        Input protein FASTA filename.
    output_filename : str
        Output protein FASTA filename.
    gff3_file : str
        Input genome GFF3 filename.

    """
    
    def parse_gff3(gff3_file: str) -> dict:
        """Parse GFF3 file and make dict with 'protein_id', 'start', 'end' and 'length' of each gene. 

        Args
        ----
        gff3_file : str

        Returns
        -------
        genes : dict
            Dict with 'protein_id', 'start', 'end' and 'length' of each genes. 

        """
        genes = {}
        with open(gff3_file, "r") as gff3_handle:
            for line in gff3_handle:
                ## Exclude comments
                if line.startswith("#"):
                    continue

                li = line.strip().split("\t")
                if len(li) != 9:
                    continue
                ptg_num, tools, kind, start, end, score, strand, phase, attributes = li

                ## Exclude lines other than CDS.
                if kind != "CDS":
                    continue

                ## Handle attributes.
                attr_dict = {}
                for attr in attributes.split("; "):
                    key_value = attr.split(" ")
                    ##print(len(key_value))
                    if len(key_value) != 2:
                        continue
                    key, value = key_value
                    attr_dict[key] = value
                
                ## Exclude CDS without protein_id.
                if "transcript_id" not in attr_dict:
                    print("trans")
                    continue

                ## Exclude CDS without protein_id.
                if "gene_id" not in attr_dict:
                    print("gene_ID")
                    continue

                ## Read information of CDS.
                protein_id = attr_dict["transcript_id"]
                protein_id = protein_id.strip('"')
                print(protein_id)
                if "gene_id" in attr_dict:
                    gene = attr_dict["gene_id"]
                start, end, length = int(start), int(end), int(end) - int(start)
                if gene not in genes:
                    genes[gene] = []
                genes[gene].append({"protein_id": protein_id, "start": start, "end": end, "length": length})

        return genes

    def select_longest_protein(genes: dict) -> dict:
        """Select the longest proteins for each gene based on CDS information.

        Args
        ----
        genes : dict

        Returns
        -------
        longest_proteins : dict

        """
        longest_proteins = {}
        for gene, cds_list in genes.items():
            protein_lengths = {}
            for cds in cds_list:
                protein_id = cds["protein_id"]
                length = cds["length"]
                if protein_id not in protein_lengths:
                    protein_lengths[protein_id] = length
                else:
                    protein_lengths[protein_id] += length
            ## Select the longest protein.
            longest_proteins[gene] = max(protein_lengths, key=protein_lengths.get)

        return longest_proteins

    def slice_proteins(input_filename: str, output_filename: str, longest_proteins: dict) -> None:
        """Slice FASTA file to retain only the longest proteins for each gene.

        Args
        ----
        input_filename : str
        output_filename : str
        longest_proteins : dict

        """
        input_proteins = SeqIO.to_dict(SeqIO.parse(input_filename, "fasta"))
        selected_protein_ids = set(longest_proteins.values())

        output_proteins = []
        for selected_protein_id in selected_protein_ids:
            if selected_protein_id not in input_proteins:
                continue
            output_proteins.append(input_proteins[selected_protein_id])
        
        with open(output_filename, "w") as output_handle:
            SeqIO.write(output_proteins, output_handle, "fasta")

    genes = parse_gff3(gff3_file)
    longest_proteins = select_longest_protein(genes)
    slice_proteins(input_filename, output_filename, longest_proteins)


exclude_isoforms_by_length("braker/braker.aa", "241129_madara_iso1.aa", "braker/braker.gtf")

これで、アイソフォームを抜いた241129_madara_iso1.aaができた。

(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$  singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat 241129_madara_iso1.aa 
file                   format  type     num_seqs    sum_len  min_len  avg_len  max_len
241129_madara_iso1.aa  FASTA   Protein    12,337  6,131,098        5      497   20,186
(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$

DfamRepeatMasker用データとマダラのゲノムデータをEDTABuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ」のBUSCO

~/tools/for_braker/241129_madaraにて以下のスクリプトを作成し、実行した。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 2
echo start at
date


date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m protein\
        -i /home/kosukesano/tools/for_braker/241129_madara/braker/braker.aa\
        -o BUSCO_OUTPUT_MADARA\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

ちなみに作業ノードで回したらこうなった。

(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$ singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco -m protein -i /home/kosukesano/tools/for_braker/241129_madara/braker/braker.aa -o BUSCO_OUTPUT_MADARA -l /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/ -f
INFO:   ***** Start a BUSCO v5.1.3 analysis, current time: 12/02/2024 16:31:37 *****
INFO:   Configuring BUSCO with local environment
INFO:   Mode is proteins
INFO:   'Force' option selected; overwriting previous results directory
INFO:   Downloading information on latest versions of BUSCO data...
INFO:   Input file is /home/kosukesano/tools/for_braker/241129_madara/braker/braker.aa
INFO:   Using local lineages directory /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/
INFO:   Running BUSCO using lineage dataset  (eukaryota, 2024-01-08)
INFO:   ***** Run HMMER on gene sequences *****
INFO:   Running 1013 job(s) on hmmsearch, starting at 12/02/2024 16:31:39
INFO:   [hmmsearch]     102 of 1013 task(s) completed
INFO:   [hmmsearch]     203 of 1013 task(s) completed
INFO:   [hmmsearch]     304 of 1013 task(s) completed
INFO:   [hmmsearch]     406 of 1013 task(s) completed
INFO:   [hmmsearch]     507 of 1013 task(s) completed
INFO:   [hmmsearch]     608 of 1013 task(s) completed
INFO:   [hmmsearch]     710 of 1013 task(s) completed
INFO:   [hmmsearch]     811 of 1013 task(s) completed
INFO:   [hmmsearch]     912 of 1013 task(s) completed
INFO:   [hmmsearch]     1013 of 1013 task(s) completed
INFO:

        --------------------------------------------------
        |Results from dataset                             |
        --------------------------------------------------
        |C:96.6%[S:83.4%,D:13.2%],F:0.7%,M:2.7%,n:1013    |
        |979    Complete BUSCOs (C)                       |
        |845    Complete and single-copy BUSCOs (S)       |
        |134    Complete and duplicated BUSCOs (D)        |
        |7      Fragmented BUSCOs (F)                     |
        |27     Missing BUSCOs (M)                        |
        |1013   Total BUSCO groups searched               |
        --------------------------------------------------
INFO:   BUSCO analysis done. Total running time: 359 seconds
INFO:   Results written in /home/kosukesano/tools/for_braker/241129_madara/BUSCO_OUTPUT_MADARA
INFO:   For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html

(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$

1203

GeMoMaの導入

scorpionで実行した。

まず、EDTA環境を立ち上げてmambaを起動する。この状態でEDTA環境から抜けるとpyenvmambaが使える状態でbase環境に入れる。

dendezia@scorpion:~/tool/pyenv_env$ source EDTA_profile 
(EDTA2) dendezia@scorpion:~/tool/pyenv_env$ conda deactivate
(base) dendezia@scorpion:~/tool/pyenv_env$ 

次に、gemoma環境を作る。

(base) dendezia@scorpion:~/tool/pyenv_env$ mamba create -n gemoma -y

                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (1.1.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: []

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

To activate this environment, use

     $ mamba activate gemoma

To deactivate an active environment, use

     $ mamba deactivate

(base) dendezia@scorpion:~/tool/pyenv_env$ mamba activate gemoma
(gemoma) dendezia@scorpion:~/tool/pyenv_env$

空のgemoma環境を立ち上げる。

その中で、GeMoMaをインストールする。

(gemoma) dendezia@scorpion:~/tool/pyenv_env$ mamba install -c conda-forge -c bioconda gemoma=1.9 -y

                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (1.1.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: ['gemoma=1.9']

bioconda/noarch                                      5.3MB @   4.8MB/s  1.2s
bioconda/linux-64                                    5.7MB @   4.4MB/s  1.3s
conda-forge/noarch                                  20.3MB @   7.0MB/s  3.1s
conda-forge/linux-64                                47.2MB @   7.9MB/s  6.7s
Transaction

  Prefix: /home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/envs/gemoma

  Updating specs:

   - gemoma=1.9

.
.
.
.
.

これでGeMoMaのインストールは完了。

ヘルプを出してみる。

(gemoma) dendezia@scorpion:~/tool/pyenv_env$ GeMoMa -h
Searching for the new GeMoMa updates ...
You are using the latest GeMoMa version.

This jar allows to run all parts of GeneModelMapper (GeMoMa) except the external search algorithm (e.g. tblastn).


For more information please visit http://www.jstacs.de/index.php/GeMoMa
If you have any questions, comments or bugs, please check FAQs on our homepage, our github page https://github.com/Jstacs/Jstacs/labels/GeMoMa or contact jens.keilwagen@julius-kuehn.de

If you use this tool, please cite

@article{Keilwagen:2016:GeMoMa,
 author = {Keilwagen, Jens and Wenk, Michael and Erickson, Jessica L. and Schattat, Martin H. and Grau, Jan and Hartung, Frank},
 title = {{Using intron position conservation for homology-based gene prediction}},
 journal = {Nucleic Acids Research},
 volume = {44},
 number = {9},
 pages = {e89-e89},
 year = {2016},
 month = {02},
 issn = {0305-1048},
 doi = {10.1093/nar/gkw092}
}

@article{Keilwagen:2018:GeMoMa_RNAseq,
 author = {Keilwagen, Jens and Hartung, Frank and Paulini, Michael and Twardziok, Sven O. and Grau, Jan},
 title = {Combining RNA-seq data and homology-based gene prediction for plants, animals and fungi},
 journal = {BMC Bioinformatics},
 year = {2018},
 month = {May},
 day = {30},
 volume = {19},
 number = {1},
 pages = {189},
 issn = {1471-2105},
 doi = {10.1186/s12859-018-2203-5}
}
.
.
.
.
.
.

いけてそう。

環境立ち上げプロファイルを作成、~/tool/pyenv_envgemoma_profileを書いた。

### gemoma_profileの中身

source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
        . "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
    else
        export PATH="/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
    fi
fi
unset __conda_setup
if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
    . "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<

conda activate gemoma

これを遺伝研でもやった。

遺伝研にて、~/tools/for_gemoma/241203_testを作成、以下のスクリプトを実行した。

### madara_gemoma.shの中身


#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 8
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

source /home/kosukesano/tools/pyenv_env/gemoma_profile

GeMoMa GeMoMaPipeline \
        t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=8 \
        outdir=/home/kosukesano/tools/for_gemoma/241203_test/madara_out

echo end at
date

マダラのRNA-seqのBAMファイル

240430_ddbj_backup       240529_RNAseq            kosukesano_oldPC         old_file                 山昆生データ
240514_new_weebil_genome 240705                   merged_madara.zip        sano
:/Volumes/Elements_1$ pwd
/Volumes/Elements_1
:/Volumes/Elements_1$ pwd
/Volumes/Elements_1
:/Volumes/Elements_1$ cd
:~$ scp /Volumes/Elements_1/merged_madara.zip kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_gemoma/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
|  ..o.o...*   o+ |
|   . . ..= + o* o|
|       .  = oB +.|
|      +.oo .+E+o.|
|      .*S.  o.o.+|
|      .o. .  . .+|
|      ..   +  . o|
|        . ..oo . |
|         . .=.   |
+----[SHA256]-----+
merged_madara.zip 
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ ls
merged_madara.zip  reference
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ unzip merged_madara.zip 
Archive:  merged_madara.zip
  inflating: merged_madara.bam       
 
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ 
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ ls
merged_madara.bam  merged_madara.zip  reference
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$

1204

scorpion環境下でのGeMoMa

~/tool/for_gemoma/241204_test下でmadara_gemoma.shを作成、qsubで投げた。

### scorpionのmadara_gemoma.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

source /home/dendezia/tool/pyenv_env/gemoma_profile

GeMoMa GeMoMaPipeline \
        t=/home/dendezia/tool/for_gemoma/nama_data/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=1 \
        outdir=/home/dendezia/tool/for_gemoma/241204_test/madara_out

echo end at
date

scorpion環境下でのpanther

~/tool/for_pantherを作成、そこで以下のコマンドを実行した。

wget -r  http://data.pantherdb.org/ftp/hmm_scoring/current_release/pantherScore2.2/

1205

scorpion環境下でのpanther続き

こんな感じになった

dendezia@scorpion:~/tool/for_panther$ ls
data.pantherdb.org
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/
ftp  index.html
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/ftp/
CellDesigner  biopax         downloads        hmm_classifications  index.html  ortholog          panther_library  peregrine_data  sequence_classifications  tools
TIPS          cSNP_analysis  generic_mapping  hmm_scoring          linkouts    panther_interpro  pathway          pub             tmp                       vsftpd
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/ftp/panther_library/
11.1  12.0  13.0  13.1  14.0  14.1  15.0  16.0  17.0  18.0  19.0  current_release  index.html
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/ftp/panther_library/current_release/
index.html
dendezia@scorpion:~/tool/for_panther$ 

なんか思ってたのと違うな。

以下のコマンドを実行

dendezia@scorpion:~/tool/for_panther$ wget -r http://data.pantherdb.org/ftp/hmm_scoring/current_release/pantherScore2.2/lib/

1209

scorpion環境下でのpanther続き

hummerのファイルを持ってきて、解凍。

:~/Downloads$ scp hmmer-3.1b2.tar.gz dendezia@scorpion:/home/dendezia/tool/for_panther
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
|       .+. .=o=+.|
|        o*.o.=.*+|
|       oo.*oo B.o|
|      ..o= +.* ..|
|    o .+S o * .  |
|   . o. .  E     |
|      ....o      |
|       oo+       |
|       o=        |
+----[SHA256]-----+
hmmer-3.1b2.tar.gz                                                                                                                                              100% 5825KB 101.2MB/s   00:00    
:~/Downloads$
tar -zxvf hmmer-3.1b2.tar.gz

これを実行すると、hmmer-3.1b2/ディレクトリができるので、その中に入る。

makeしよう。

dendezia@scorpion:~/tool/for_panther/hmmer-3.1b2$ ./configure

dendezia@scorpion:~/tool/for_panther/hmmer-3.1b2$ make

1212

スロット数とか色々変えたGeMoMa

~/tools/for_gemoma/241212を作成し、そこで色々やった。

kosukesano@at137:~/tools/for_gemoma/241212$ ls
GeMoMa_temp  gemoma_slot_1.sh  gemoma_slot_8.sh  gemoma_slot_8_gpu.sh  madara_gemoma.sh
kosukesano@at137:~/tools/for_gemoma/241212$ 
  • スロット数1medium指定
### gemoma_slot_1.sh

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 1
#$ -l s_vmem=125G,mem_req=125G
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
ulimit -s unlimited


echo start at
date

source /home/kosukesano/tools/pyenv_env/gemoma_profile


GeMoMa GeMoMaPipeline \
        t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=1 \
        outdir=/home/kosukesano/tools/for_gemoma/241212/slot_1_out

echo end at
date
  • スロット数8medium指定
### gemoma_slot_8.sh

#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 8
#$ -l s_vmem=125G,mem_req=125G
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
ulimit -s unlimited


echo start at
date

source /home/kosukesano/tools/pyenv_env/gemoma_profile


GeMoMa GeMoMaPipeline \
        t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=8 \
        outdir=/home/kosukesano/tools/for_gemoma/241212/slot_8_out

echo end at
date
  • スロット数8gpu指定
### gemoma_slot_8_gpu.sh


#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 8
#$ -l s_vmem=125G,mem_req=125G
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
ulimit -s unlimited


echo start at
date

source /home/kosukesano/tools/pyenv_env/gemoma_profile


GeMoMa GeMoMaPipeline \
        t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=8 \
        outdir=/home/kosukesano/tools/for_gemoma/241212/slot_8_gpu_out

echo end at
date

ローカルでのPyenv実装

pyenvgitでインストール

:~$ git clone https://github.com/yyuu/pyenv.git ~/.pyenv
Cloning into '/Users/kosukesano/.pyenv'...
remote: Enumerating objects: 25324, done.
remote: Counting objects: 100% (2077/2077), done.
remote: Compressing objects: 100% (244/244), done.
remote: Total 25324 (delta 1913), reused 1905 (delta 1822), pack-reused 23247 (from 1)
Receiving objects: 100% (25324/25324), 5.62 MiB | 17.70 MiB/s, done.
Resolving deltas: 100% (17063/17063), done.
:~$ ls -a
.                   .Rapp.history       .bash_profile       .cups               .ncbi               .viminfo            Applications        Library             Public
..                  .Rhistory           .bash_sessions      .docker             .pyenv              .vscode             Desktop             Movies              bin
.CFUserTextEncoding .Trash              .bashrc             .lesshst            .python_history     .vscode-R           Documents           Music               bio
.DS_Store           .bash_history       .config             .local              .ssh                .wget-hsts          Downloads           Pictures            bioinfo
:~$

pyenv_conda_environmentディレクトリを作成し、pyenv_profileを作る。

:~$ mkdir pyenv_conda_environment
:~$ cd pyenv_conda_environment/
:~/pyenv_conda_environment$ nano pyenv_profile
:~/pyenv_conda_environment$ 
### pyenv_profile

export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"

sourceしてヘルプを見る。実行できてるね。

:~/pyenv_conda_environment$ source pyenv_profile 
:~/pyenv_conda_environment$ pyenv
pyenv 2.4.22-1-ga2ad48aa
Usage: pyenv <command> [<args>]

Some useful pyenv commands are:
   --version   Display the version of pyenv
   commands    List all available pyenv commands
   exec        Run an executable with the selected Python version
   global      Set or show the global Python version(s)
   help        Display help for a command
   hooks       List hook scripts for a given pyenv command
   init        Configure the shell environment for pyenv
   install     Install a Python version using python-build
   latest      Print the latest installed or known version with the given prefix
   local       Set or show the local application-specific Python version(s)
   prefix      Display prefixes for Python versions
   rehash      Rehash pyenv shims (run this after installing executables)
   root        Display the root directory where versions and shims are kept
   shell       Set or show the shell-specific Python version
   shims       List existing pyenv shims
   uninstall   Uninstall Python versions
   version     Show the current Python version(s) and its origin
   version-file   Detect the file that sets the current pyenv version
   version-name   Show the current Python version
   version-origin   Explain how the current Python version is set
   versions    List all Python versions available to pyenv
   whence      List all Python versions that contain the given executable
   which       Display the full path to an executable

See `pyenv help <command>' for information on a specific command.
For full documentation, see: https://github.com/pyenv/pyenv#readme
:~/pyenv_conda_environment$

ローカルでのGeMoMa実装

pyenvmambaforge環境を用意する。

:~/pyenv_conda_environment$ pyenv install mambaforge-23.10.0-0
Downloading Mambaforge-23.10.0-0-MacOSX-x86_64.sh.sh...
-> https://github.com/conda-forge/miniforge/releases/download/23.10.0-0/Mambaforge-23.10.0-0-MacOSX-x86_64.sh
Installing Mambaforge-23.10.0-0-MacOSX-x86_64.sh...
Channels:
 - conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done


==> WARNING: A newer version of conda exists. <==
    current version: 23.10.0
    latest version: 24.11.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



## Package Plan ##

  environment location: /Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0

  added / updated specs:
    - conda=23.10.0
    - pip


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    ca-certificates-2024.8.30  |       h8857fd0_0         155 KB  conda-forge
    certifi-2024.8.30          |     pyhd8ed1ab_0         160 KB  conda-forge
    openssl-3.4.0              |       hd471939_0         2.5 MB  conda-forge
    pip-24.3.1                 |     pyh8b19718_0         1.2 MB  conda-forge
    ------------------------------------------------------------
                                           Total:         4.0 MB

The following packages will be UPDATED:

  ca-certificates                     2023.11.17-h8857fd0_0 --> 2024.8.30-h8857fd0_0 
  certifi                           2023.11.17-pyhd8ed1ab_0 --> 2024.8.30-pyhd8ed1ab_0 
  openssl                                  3.2.0-hd75f5a5_0 --> 3.4.0-hd471939_0 
  pip                                   23.3.1-pyhd8ed1ab_0 --> 24.3.1-pyh8b19718_0 



Downloading and Extracting Packages:
                                                                                                                                                                                                  
Preparing transaction: done                                                                                                                                                                       
Verifying transaction: done                                                                                                                                                                       
Executing transaction: done                                                                                                                                                                       
Installed Mambaforge-23.10.0-0-MacOSX-x86_64.sh to /Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0
:~/pyenv_conda_environment$ 

空のgemoma環境の作成

:~/pyenv_conda_environment$ mamba create -n gemoma -y

Looking for: []

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

To activate this environment, use

     $ mamba activate gemoma

To deactivate an active environment, use

     $ mamba deactivate

:~/pyenv_conda_environment$ 

この後、シェルをリセット。

gemoma_profileを作成

### ~/pyenv_conda_environment/gemoma_profileの中身

source ~/.bash_profile
source ~/pyenv_conda_environment/pyenv_profile
pyenv global mambaforge-23.10.0-0



# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
    eval "$__conda_setup"
else
    if [ -f "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/conda.sh" ]; then
        . "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/conda.sh"
    else
        export PATH="/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/bin:$PATH"
    fi
fi
unset __conda_setup
if [ -f "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/mamba.sh" ]; then
    . "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<

conda activate gemoma

GeMoMaのインストール

(gemoma) :~/pyenv_conda_environment$ mamba install -c conda-forge -c bioconda gemoma=1.9 -y

Looking for: ['gemoma=1.9']

warning  libmamba Could not parse mod/etag header
warning  libmamba Could not parse mod/etag header
bioconda/osx-64 (check zst)                         Checked  0.4s
bioconda/noarch (check zst)                         Checked  0.0s
bioconda/osx-64                                      3.9MB @   9.0MB/s  0.5s
bioconda/noarch                                      4.4MB @   8.1MB/s  0.6s
conda-forge/noarch                                  17.7MB @  18.6MB/s  1.0s
conda-forge/osx-64                                  35.5MB @  28.6MB/s  1.3s
Transaction

  Prefix: /Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/envs/gemoma

  Updating specs:

   - gemoma=1.9
.
.
.
.
.
(gemoma) :~/pyenv_conda_environment$  GeMoMa -h
Searching for the new GeMoMa updates ...
You are using the latest GeMoMa version.

This jar allows to run all parts of GeneModelMapper (GeMoMa) except the external search algorithm (e.g. tblastn).


For more information please visit http://www.jstacs.de/index.php/GeMoMa
If you have any questions, comments or bugs, please check FAQs on our homepage, our github page https://github.com/Jstacs/Jstacs/labels/GeMoMa or contact jens.keilwagen@julius-kuehn.de

If you use this tool, please cite
.
.
.
.
.

できた。

ローカルでのGeMoMa実行

#$ -S /bin/bash
#$ -cwd

echo start at
date

source /Users/kosukesano/pyenv_conda_environment/gemoma_profile

GeMoMa GeMoMaPipeline \
        t=/Users/kosukesano/bio/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/Users/kosukesano/bio/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/Users/kosukesano/bio/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=1 \
        outdir=/Users/kosukesano/bio/for_gemoma/241212/madara_out

echo end at
date

これをshで実行した。

1213

scorpionでのGeMoMa実行

#$ -S /bin/bash
#$ -cwd

echo start at
date

# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa -Xmx100g         GeMoMaPipeline         t=/home/dendezia/tool/for_gemoma/nama_data/231117_madaragenome.fasta         r=NO   o=true i=Tcas a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO threads=4

echo end at
date

これを投げた。改行入れてたら謎のエラーが出て、改行を消したら直った。

同じものをローカルでも動かしておいてる。

デバッグ用にスクリプトもう一個書いた。

### 241213_gemoma_debug.sh



#$ -S /bin/bash
#$ -cwd

echo start at
date

# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx100g \
        GeMoMaPipeline \
        t=/home/dendezia/tool/for_gemoma/nama_data/231117_madaragenome.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/dendezia/tool/for_gemoma/241204_test/madara_out_debug

echo end at
date

これを以下のコマンドで投げた

(gemoma) dendezia@scorpion:~/tool/for_gemoma/241204_test$ qsub -l ncpus=10 241213_gemoma_debug.sh 
2075.scorpion
(gemoma) dendezia@scorpion:~/tool/for_gemoma/241204_test$ 

1216

GeMoMa結果

ローカルで実行した方

=============================
Starting: SyntenyChecker (32974.065s)
Finished: SyntenyChecker (32974.407s)
Starting: Extractor for final prediction (32974.407s)
Finished: Extractor for final prediction (33062.396s)

Statistics:
Job     WAITING RUNNING INTERRUPTED     FAILED  SUCCEEDED
---------------------------------------------------------
MmseqsCreateDB  0       0       0       0       1
EREAndFill      0       0       0       0       1
ExtractorAndSplit       0       0       0       0       1
Mmseqs  0       0       0       0       1
GeMoMa  0       0       0       0       1
Cat     0       0       0       0       1
GAF     0       0       0       0       1
AnnotationFinalizer     0       0       0       0       1
Extractor       0       0       0       0       1
SyntenyChecker  0       0       0       0       1

No errors detected.
Elapsed time: 33066 seconds     (9h 11m 6s)
end at
2024年 12月13日 金曜日 23時51分28秒 JST
(gemoma) :~/bio/for_gemoma/241212$ ls
GeMoMa_temp     madara_geoma.sh madara_out
(gemoma) :~/bio/for_gemoma/241212$ ls madara_
ls: madara_: No such file or directory
(gemoma) :~/bio/for_gemoma/241212$ ls madara_out/
final_annotation.gff                      protocol_GeMoMaPipeline.txt               unfiltered_predictions_from_species_0.gff
predicted_proteins.fasta                  reference_gene_table.tabular

これ行けたんじゃね?

### final_annotation.gff の中身

##gff-version 3
#SOFTWARE INFO: GeMoMaPipeline 1.9; SIMPLE PARAMETERS: species: own; ID: Tcas; weight: 1.0; tblastn: false; tag: mRNA; RNA-seq evidence: NO; denoise: DENOISE; DenoiseIntrons.maximum intron length: 15000; DenoiseIntrons.minimum expression: 0.01; DenoiseIntrons.context: 10; Extractor.upcase IDs: false; Extractor.repair: false; Extractor.Ambiguity: AMBIGUOUS; Extractor.discard pre-mature stop: true; Extractor.stop-codon excluded from CDS: false; Extractor.full-length: true; GeMoMa.reads: 1; GeMoMa.splice: true; GeMoMa.gap opening: 11; GeMoMa.gap extension: 1; GeMoMa.maximum intron length: 15000; GeMoMa.static intron length: true; GeMoMa.intron-loss-gain-penalty: 25; GeMoMa.reduction factor: 10; GeMoMa.e-value: 100.0; GeMoMa.contig threshold: 0.4; GeMoMa.hit threshold: 0.9; GeMoMa.output: STATIC; GeMoMa.predictions: 10; GeMoMa.avoid stop: true; GeMoMa.approx: true; GeMoMa.protein alignment: true; GeMoMa.verbose: false; GeMoMa.timeout: 3600; GeMoMa.replace unknown: false; GeMoMa.Score: ReAlign; GAF.default attributes: tie,tde,tae,iAA,pAA,score,lpm,maxGap,bestScore,maxScore,raa,rce; GAF.kmeans: NO; GAF.filter: start=='M' and stop=='*' and (isNaN(score) or score/aa>=0.75); GAF.sorting: sumWeight,score,aa; GAF.alternative transcript filter: tie==1 or sumWeight>1; GAF.common border filter: 0.75; GAF.maximal number of transcripts per gene: 2147483647; GAF.add alternative transcripts: false; GAF.transfer features: false; AnnotationFinalizer.transfer features: false; AnnotationFinalizer.UTR: NO; AnnotationFinalizer.rename: NO; AnnotationFinalizer.name attribute: true; synteny check: true; predicted proteins: true; predicted CDSs: false; predicted genomic regions: false; output individual predictions: true; debug: true; restart: false; BLAST_PATH: ; MMSEQS_PATH: 
##sequence-region ptg000128l_length_99247 1 99247
ptg000128l_length_99247 GAF     gene    17016   17514   .       +       .       ID=gene_10435;transcripts=1;complete=1
ptg000128l_length_99247 GeMoMa  mRNA    17016   17514   .       +       .       ID=Tcas_rna-XM_969231.3_R0;ref-gene=Tcas_gene-LOC663173;aa=94;raa=93;score=408;prediction=0;bestScore=408;ce=2;rce=2;pAA=0.8723;iAA=0.8404;lpm=30;maxScore=487;maxGap=1;nps=0;start=M;stop=*;evidence=1;Parent=gene_10435;sumWeight=1.0;
ptg000128l_length_99247 GeMoMa  CDS     17016   17126   .       +       0       Parent=Tcas_rna-XM_969231.3_R0
ptg000128l_length_99247 GeMoMa  CDS     17344   17514   .       +       0       Parent=Tcas_rna-XM_969231.3_R0
ptg000128l_length_99247 GAF     gene    18035   46865   .       +       .       ID=gene_10436;transcripts=1;complete=1
ptg000128l_length_99247 GeMoMa  mRNA    18035   46865   .       +       .       ID=Tcas_rna-XM_968720.3_R0;ref-gene=Tcas_gene-LOC662636;aa=679;raa=677;score=2580;prediction=0;bestScore=2580;ce=11;rce=8;pAA=0.8175;iAA=0.7372;lpm=59;maxScore=3632;maxGap=4;nps=0;start=M;stop=*;evidence=1;Parent=gene_10436;sumWeight=1.0;
ptg000128l_length_99247 GeMoMa  CDS     18035   18172   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     24694   25008   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     29089   29349   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     29424   29556   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     33893   34125   .       +       2       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     42340   42590   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     42654   42782   .       +       1       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     42849   42975   .       +       1       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     46296   46458   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     46521   46669   .       +       2       Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa  CDS     46728   46865   .       +       0       Parent=Tcas_rna-XM_968720.3_R0
.
.
.
.
.

GINGERのインストール

scorpionで行った。

GINGERはアノテーション統合ツールの1つで、EvidenceModelerの代替となりうる(?)ツール

(base) dendezia@scorpion:~/tool$ mamba create -n ginger -y

                  __    __    __    __
                 /  \  /  \  /  \  /  \
                /    \/    \/    \/    \
███████████████/  /██/  /██/  /██/  /████████████████████████
              /  / \   / \   / \   / \  \____
             /  /   \_/   \_/   \_/   \    o \__,
            / _/                       \_____/  `
            |/
        ███╗   ███╗ █████╗ ███╗   ███╗██████╗  █████╗
        ████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
        ██╔████╔██║███████║██╔████╔██║██████╔╝███████║
        ██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
        ██║ ╚═╝ ██║██║  ██║██║ ╚═╝ ██║██████╔╝██║  ██║
        ╚═╝     ╚═╝╚═╝  ╚═╝╚═╝     ╚═╝╚═════╝ ╚═╝  ╚═╝

        mamba (1.1.0) supported by @QuantStack

        GitHub:  https://github.com/mamba-org/mamba
        Twitter: https://twitter.com/QuantStack

█████████████████████████████████████████████████████████████


Looking for: []

Preparing transaction: done
Verifying transaction: done
Executing transaction: done

To activate this environment, use

     $ mamba activate ginger

To deactivate an active environment, use

     $ mamba deactivate

(base) dendezia@scorpion:~/tool$ mamba activate ginger
(ginger) dendezia@scorpion:~/tool$ 

condaで入れようとしたけど、依存ツールが無限に入らず、断念。

gemoma結果のBUSCO

scorpion環境で行った。

### buscoのインストール

(base) dendezia@scorpion:~/tool$ mamba create -n busco -y
(base) dendezia@scorpion:~/tool$ mamba activate busco
(busco) dendezia@scorpion:~/tool$ mamba install -c conda-forge -c bioconda busco=5.8.1 -y
### busco.shの中身

#$ -S /bin/bash
#$ -cwd

echo start at
date

source /home/dendezia/tool/pyenv_env/busco_profile

busco\
        -m protein\
        -i /home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/predicted_proteins.fasta\
        -o /home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/BUSCO_OUTPUT_GEMOMA\
        -l arthropoda_odb10/\
        -f

echo end at
date

結果はこんな感じ

### 241204_test/madara_out_debug/busco.sh.o2077の中身


start at
Mon Dec 16 07:05:13  2024
2024-12-16 07:05:47 INFO:       ***** Start a BUSCO v5.8.1 analysis, current time: 12/16/2024 07:05:47 *****
2024-12-16 07:05:47 INFO:       Configuring BUSCO with local environment
2024-12-16 07:05:47 INFO:       Running proteins mode
2024-12-16 07:05:47 INFO:       Downloading information on latest versions of BUSCO data...
2024-12-16 07:05:48 INFO:       Download connection problem. Retrying in 10 seconds
2024-12-16 07:05:58 INFO:       Download connection problem. Retrying in 100 seconds
2024-12-16 07:07:40 INFO:       Input file is /home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/predicted_proteins.fasta
2024-12-16 07:07:40 INFO:       Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/arthropoda_odb10.2024-01-08.tar.gz'
2024-12-16 07:07:49 INFO:       Decompressing file '/misc/home/dendezia/busco_downloads/lineages/arthropoda_odb10.tar.gz'
2024-12-16 07:08:06 INFO:       Running BUSCO using lineage dataset arthropoda_odb10 (eukaryota, 2024-01-08)
2024-12-16 07:08:06 INFO:       ***** Run HMMER on gene sequences *****
2024-12-16 07:08:06 INFO:       Running 1013 job(s) on hmmsearch, starting at 12/16/2024 07:08:06
2024-12-16 07:08:34 INFO:       [hmmsearch]     102 of 1013 task(s) completed
2024-12-16 07:08:56 INFO:       [hmmsearch]     203 of 1013 task(s) completed
2024-12-16 07:09:14 INFO:       [hmmsearch]     304 of 1013 task(s) completed
2024-12-16 07:09:32 INFO:       [hmmsearch]     406 of 1013 task(s) completed
2024-12-16 07:09:49 INFO:       [hmmsearch]     507 of 1013 task(s) completed
2024-12-16 07:10:23 INFO:       [hmmsearch]     608 of 1013 task(s) completed
2024-12-16 07:10:55 INFO:       [hmmsearch]     710 of 1013 task(s) completed
2024-12-16 07:11:23 INFO:       [hmmsearch]     811 of 1013 task(s) completed
2024-12-16 07:11:47 INFO:       [hmmsearch]     912 of 1013 task(s) completed
2024-12-16 07:12:13 INFO:       [hmmsearch]     1013 of 1013 task(s) completed
2024-12-16 07:12:20 INFO:       

    ---------------------------------------------------
    |Results from dataset arthropoda_odb10             |
    ---------------------------------------------------
    |C:78.3%[S:75.2%,D:3.1%],F:6.4%,M:15.3%,n:1013     |
    |793    Complete BUSCOs (C)                        |
    |762    Complete and single-copy BUSCOs (S)        |
    |31    Complete and duplicated BUSCOs (D)          |
    |65    Fragmented BUSCOs (F)                       |
    |155    Missing BUSCOs (M)                         |
    |1013    Total BUSCO groups searched               |
    ---------------------------------------------------
2024-12-16 07:12:20 INFO:       BUSCO analysis done. Total running time: 279 seconds
2024-12-16 07:12:20 INFO:       Results written in home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/BUSCO_OUTPUT_GEMOMA
2024-12-16 07:12:20 INFO:       For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html

2024-12-16 07:12:20 INFO:       Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
end at
Mon Dec 16 07:12:21  2024

うーん低い。これはマスキングしていないからか、それともレファレンスが1種類だけだからか?

マスキングしたデータを使ったGeMoMa

241128にRepeatModelerのde novoデータベースとDfamのデータベースを結合したファイルを元にマスキングした結果がscorpionにあったので、これを使ってGeMoMaをやってみよう。

(busco) dendezia@scorpion:~/tool/for_RepeatMasker_Docker$ ls 241128_madara/output_dir/
231117_madaragenome.fasta.cat.gz  231117_madaragenome.fasta.masked  231117_madaragenome.fasta.out  231117_madaragenome.fasta.out.gff  231117_madaragenome.fasta.tbl
(busco) dendezia@scorpion:~/tool/for_RepeatMasker_Docker/241128_madara/output_dir$ cp 231117_madaragenome.fasta.masked ~/tool/for_gemoma/nama_data/241128_madara_masked.fasta
(busco) dendezia@scorpion:~/tool/for_RepeatMasker_Docker/241128_madara/output_dir$ 

~/tool/for_gemoma/241216を作成、その下で241216_madara_gemoma.shを書いた。

### 241216_madara_gemoma.shの中身


#$ -S /bin/bash
#$ -cwd

echo start at
date

# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx100g \
        GeMoMaPipeline \
        t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/dendezia/tool/for_gemoma/241216/241216_madara_out

echo end at
date

qsubで投げた

1217

マスキングしたデータを使ったGeMoMa結果

(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ ls 241216_madara_out/
final_annotation.gff  predicted_proteins.fasta  protocol_GeMoMaPipeline.txt  reference_gene_table.tabular  unfiltered_predictions_from_species_0.gff
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$

できてる。

これをbuscoにかけたらこんな感じ

(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ busco -m protein -i 241216_madara_out/predicted_proteins.fasta -o BUSCO_OUTPUT_GEMOMA -l arthropoda_odb10 -f
2024-12-17 10:19:46 INFO:       ***** Start a BUSCO v5.8.1 analysis, current time: 12/17/2024 10:19:46 *****
2024-12-17 10:19:46 INFO:       Configuring BUSCO with local environment
2024-12-17 10:19:46 INFO:       Running proteins mode
2024-12-17 10:19:46 INFO:       Downloading information on latest versions of BUSCO data...
2024-12-17 10:19:50 INFO:       Input file is /home/dendezia/tool/for_gemoma/241216/241216_madara_out/predicted_proteins.fasta
2024-12-17 10:19:50 INFO:       Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/arthropoda_odb10.2024-01-08.tar.gz'
2024-12-17 10:19:59 INFO:       Decompressing file '/home/dendezia/tool/for_gemoma/241216/busco_downloads/lineages/arthropoda_odb10.tar.gz'
2024-12-17 10:20:02 INFO:       Running BUSCO using lineage dataset arthropoda_odb10 (eukaryota, 2024-01-08)
2024-12-17 10:20:02 INFO:       ***** Run HMMER on gene sequences *****
2024-12-17 10:20:02 INFO:       Running 1013 job(s) on hmmsearch, starting at 12/17/2024 10:20:02
2024-12-17 10:20:17 INFO:       [hmmsearch]     102 of 1013 task(s) completed
2024-12-17 10:20:35 INFO:       [hmmsearch]     203 of 1013 task(s) completed
2024-12-17 10:20:48 INFO:       [hmmsearch]     304 of 1013 task(s) completed
2024-12-17 10:20:59 INFO:       [hmmsearch]     406 of 1013 task(s) completed
2024-12-17 10:21:10 INFO:       [hmmsearch]     507 of 1013 task(s) completed
2024-12-17 10:21:42 INFO:       [hmmsearch]     608 of 1013 task(s) completed
2024-12-17 10:22:13 INFO:       [hmmsearch]     710 of 1013 task(s) completed
2024-12-17 10:22:37 INFO:       [hmmsearch]     811 of 1013 task(s) completed
2024-12-17 10:22:58 INFO:       [hmmsearch]     912 of 1013 task(s) completed
2024-12-17 10:23:20 INFO:       [hmmsearch]     1013 of 1013 task(s) completed
2024-12-17 10:23:21 INFO:

    ---------------------------------------------------
    |Results from dataset arthropoda_odb10             |
    ---------------------------------------------------
    |C:78.3%[S:75.2%,D:3.1%],F:6.4%,M:15.3%,n:1013     |
    |793    Complete BUSCOs (C)                        |
    |762    Complete and single-copy BUSCOs (S)        |
    |31    Complete and duplicated BUSCOs (D)          |
    |65    Fragmented BUSCOs (F)                       |
    |155    Missing BUSCOs (M)                         |
    |1013    Total BUSCO groups searched               |
    ---------------------------------------------------
2024-12-17 10:23:21 INFO:       BUSCO analysis done. Total running time: 211 seconds
2024-12-17 10:23:21 INFO:       Results written in /home/dendezia/tool/for_gemoma/241216/BUSCO_OUTPUT_GEMOMA
2024-12-17 10:23:21 INFO:       For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html

2024-12-17 10:23:21 INFO:       Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$

seqkitの結果はこう

(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ seqkit stat 241216_madara_out/predicted_proteins.fasta 
file                                        format  type     num_seqs    sum_len  min_len  avg_len  max_len
241216_madara_out/predicted_proteins.fasta  FASTA   Protein    10,463  4,799,632       31    458.7   23,673
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ 

ソフトマスクの有無で結果が完全に同じ。GeMoMaはマスキングを認識しない?

RNA-seqデータを追加したGeMoMa

奏子先生に頂いたmerged_madara.zipscorpionに転送

(gemoma) :~/bio/for_gemoma/241212$ scp /Volumes/Elements_1/merged_madara.zip dendezia@scorpion:/home/dendezia/tool/for_gemoma/nama_data
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
|       .+. .=o=+.|
|        o*.o.=.*+|
|       oo.*oo B.o|
|      ..o= +.* ..|
|    o .+S o * .  |
|   . o. .  E     |
|      ....o      |
|       oo+       |
|       o=        |
+----[SHA256]-----+
merged_madara.zip                                                                                                                                              100% 6110MB 107.5MB/s   00:56    
(gemoma) :~/bio/for_gemoma/241212$ 

解凍

(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ ls
231117_madaragenome.fasta  241128_madara_masked.fasta  merged_madara.zip  reference
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ unzip merged_madara.zip 
Archive:  merged_madara.zip
  inflating: merged_madara.bam       
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ ls
231117_madaragenome.fasta  241128_madara_masked.fasta  merged_madara.bam  merged_madara.zip  reference
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ 

~/tool/for_gemoma/241217を作成し、以下で241217_plusRNA.shを記述、qsubで投げた。

### 241217_plusRNA.sh

#$ -S /bin/bash
#$ -cwd

echo start at
date

# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx100g \
        GeMoMaPipeline \
        t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
        o=true \
        i=Tcas \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out \
        r=MAPPED \
        ERE.s=FR_FIRST_STRAND \
        ERE.m=/home/dendezia/tool/for_gemoma/nama_data/merged_madara.bam \

echo end at
date

2種のゲノムをレファレンスにしたGeMoMa

RNA-seqのデータ入れていない。

#$ -S /bin/bash
#$ -cwd

echo start at
date

# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx100g \
        GeMoMaPipeline \
        t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
        r=NO \
        o=true \
        s=own \
        i=Tcas \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        s=own \
        i=Hsap \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/GCF_000001405.40_GRCh38.p14_genomic.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/dendezia/tool/for_gemoma/241217/241217_2sp_out

echo end at
date

1218

RNA-seqも入れたGeMoMa

(gemoma) dendezia@scorpion:~/tool/for_gemoma/241217$ ls 241217_plusRNA_out/
final_annotation.gff  predicted_proteins.fasta  protocol_GeMoMaPipeline.txt  reference_gene_table.tabular  unfiltered_predictions_from_species_0.gff
(gemoma) dendezia@scorpion:~/tool/for_gemoma/241217$

できてそう。

(busco) dendezia@scorpion:~/tool/for_gemoma/241217/241217_plusRNA_out$ seqkit stat predicted_proteins.fasta 
file                      format  type     num_seqs    sum_len  min_len  avg_len  max_len
predicted_proteins.fasta  FASTA   Protein    10,463  4,799,632       31    458.7   23,673
(busco) dendezia@scorpion:~/tool/for_gemoma/241217/241217_plusRNA_out$ 


(busco) dendezia@scorpion:~/tool/for_gemoma/241217/241217_plusRNA_out$ busco -m protein -i predicted_proteins.fasta -o BUSCO_OUTPUT_GEMOMA -l arthropoda_odb10 -f
2024-12-18 11:19:06 INFO:       ***** Start a BUSCO v5.8.1 analysis, current time: 12/18/2024 11:19:06 *****
2024-12-18 11:19:06 INFO:       Configuring BUSCO with local environment
2024-12-18 11:19:06 INFO:       Running proteins mode
2024-12-18 11:19:06 INFO:       Downloading information on latest versions of BUSCO data...
2024-12-18 11:19:10 INFO:       Input file is /home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out/predicted_proteins.fasta
2024-12-18 11:19:10 INFO:       Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/arthropoda_odb10.2024-01-08.tar.gz'
2024-12-18 11:19:19 INFO:       Decompressing file '/home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out/busco_downloads/lineages/arthropoda_odb10.tar.gz'
2024-12-18 11:19:22 INFO:       Running BUSCO using lineage dataset arthropoda_odb10 (eukaryota, 2024-01-08)
2024-12-18 11:19:22 INFO:       ***** Run HMMER on gene sequences *****
2024-12-18 11:19:22 INFO:       Running 1013 job(s) on hmmsearch, starting at 12/18/2024 11:19:22
2024-12-18 11:19:38 INFO:       [hmmsearch]     102 of 1013 task(s) completed
2024-12-18 11:19:56 INFO:       [hmmsearch]     203 of 1013 task(s) completed
2024-12-18 11:20:08 INFO:       [hmmsearch]     304 of 1013 task(s) completed
2024-12-18 11:20:21 INFO:       [hmmsearch]     406 of 1013 task(s) completed
2024-12-18 11:20:31 INFO:       [hmmsearch]     507 of 1013 task(s) completed
2024-12-18 11:21:04 INFO:       [hmmsearch]     608 of 1013 task(s) completed
2024-12-18 11:21:35 INFO:       [hmmsearch]     710 of 1013 task(s) completed
2024-12-18 11:22:00 INFO:       [hmmsearch]     811 of 1013 task(s) completed
2024-12-18 11:22:20 INFO:       [hmmsearch]     912 of 1013 task(s) completed
2024-12-18 11:22:42 INFO:       [hmmsearch]     1013 of 1013 task(s) completed
2024-12-18 11:22:44 INFO:

    ---------------------------------------------------
    |Results from dataset arthropoda_odb10             |
    ---------------------------------------------------
    |C:78.3%[S:75.2%,D:3.1%],F:6.4%,M:15.3%,n:1013     |
    |793    Complete BUSCOs (C)                        |
    |762    Complete and single-copy BUSCOs (S)        |
    |31    Complete and duplicated BUSCOs (D)          |
    |65    Fragmented BUSCOs (F)                       |
    |155    Missing BUSCOs (M)                         |
    |1013    Total BUSCO groups searched               |
    ---------------------------------------------------
2024-12-18 11:22:44 INFO:       BUSCO analysis done. Total running time: 214 seconds
2024-12-18 11:22:44 INFO:       Results written in /home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out/BUSCO_OUTPUT_GEMOMA
2024-12-18 11:22:44 INFO:       For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html

2024-12-18 11:22:44 INFO:       Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO

予測結果完全に同じなんだが……。なぜえ?

### 241217_plusRNA.sh.e2081の中身

sc - synteny check (run SyntenyChecker if possible, default = true)     = true
p - predicted proteins (If *true*, returns the predicted proteins of the target organism as fastA file, default = true) = true
pc - predicted CDSs (If *true*, returns the predicted CDSs of the target organism as fastA file, default = false)       = false
pgr - predicted genomic regions (If *true*, returns the genomic regions of predicted gene models of the target organism as fastA file, default = false) = false
o - output individual predictions (If *true*, returns the predictions for each reference species, default = false)      = true
debug - debug (If *false* removes all temporary files even if the jobs exits unexpected, default = true)        = true
restart - restart (can be used to restart the latest GeMoMaPipeline run, which was finished without results, with very similar parameters, e.g., after an exception was thrown (cf. parameter debug), default = false)   = false
b - BLAST_PATH (allows to set a path to the blast binaries if not set in the environment, default = , OPTIONAL) = 
m - MMSEQS_PATH (allows to set a path to the blast binaries if not set in the environment, default = , OPTIONAL)        = 
outdir - The output directory, defaults to the current working directory (.)    = /home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out
threads - The number of threads used for the tool, defaults to 1        = 10
[mmseqs]: 16.747c6
java.lang.IllegalStateException: Records A00718:237:HMTKWDSXY:3:1101:1217:1031 (ptg000178c_length_56031:54,907) should come after A00718:237:HMTKWDSXY:3:1101:1217:1031 (ptg000178c_length_56031:54,828) when sorting with htsjdk.samtools.SAMRecordCoordinateComparator
        at htsjdk.samtools.SamReader$AssertingIterator.next(SamReader.java:549)
        at htsjdk.samtools.SamReader$AssertingIterator.next(SamReader.java:519)
        at projects.gemoma.ExtractRNAseqEvidence.run(ExtractRNAseqEvidence.java:526)
        at projects.gemoma.GeMoMaPipeline$JEREAndFill.doJob(GeMoMaPipeline.java:1539)
        at projects.gemoma.GeMoMaPipeline$FlaggedRunnable.run(GeMoMaPipeline.java:1375)
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
        at java.base/java.lang.Thread.run(Thread.java:829)
Check RNA-seq data (introns): 0% of the sequences in the reference genome are covered.
Check RNA-seq data (forward coverage): 0% of the sequences in the reference genome are covered.
Check RNA-seq data (reverse coverage): 0% of the sequences in the reference genome are covered.
Warning: Nashorn engine is planned to be removed from a future JDK release

どうも.bamの中身がよくないらしい。GeMoMaに沿うように作り直す必要がありそう。

2種の参照配列を入れたGeMoMa

### 241217_2sp.sh.e2082 の中身

threads - The number of threads used for the tool, defaults to 1        = 10
[mmseqs]: 16.747c6
[mmseqs]: Segmentation fault (core dumped)
[mmseqs]: Error: Prefilter died
[mmseqs]: Error: Search step died
java.lang.InterruptedException
        at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2109)
        at java.base/java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1454)
        at projects.gemoma.GeMoMaPipeline$1.run(GeMoMaPipeline.java:609)
        at projects.gemoma.GeMoMaPipeline$FlaggedRunnable.run(GeMoMaPipeline.java:1409)
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
        at java.base/java.lang.Thread.run(Thread.java:829)
11 jobs did not finish as expected. Please check the output carefully.
Did not delete temporary files allowing to debug.

java.lang.InterruptedException
        at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2056)
        at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2133)
        at java.base/java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1454)
        at projects.gemoma.GeMoMaPipeline$1.run(GeMoMaPipeline.java:609)
        at projects.gemoma.GeMoMaPipeline$FlaggedRunnable.run(GeMoMaPipeline.java:1409)
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
        at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
        at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
        at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
        at java.base/java.lang.Thread.run(Thread.java:829)
Exception in thread "main" java.lang.RuntimeException: Did not finish as intended. 
        at projects.gemoma.GeMoMaPipeline.run(GeMoMaPipeline.java:1234)
        at projects.gemoma.GeMoMaModule.run(GeMoMaModule.java:94)
        at de.jstacs.tools.ui.cli.CLI.run(CLI.java:426)
        at projects.gemoma.GeMoMa.main(GeMoMa.java:399)

メモリ不足(?)でMMseq2が止まったらしい。

メモリを明示してやってみる。

#$ -S /bin/bash
#$ -cwd
#PBS -l select=1:ncpus=10:mem=50gb


echo start at
date

# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx50g \
        GeMoMaPipeline \
        t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
        r=NO \
        o=true \
        s=own \
        i=Tcas \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        s=own \
        i=Hsap \
        a=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/genomic.gff \
        g=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/GCF_000001405.40_GRCh38.p14_genomic.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/dendezia/tool/for_gemoma/241217/241217_2sp_out

echo end at
date

これでもダメだったので、~/tool/for_gemoma/24121810c200g.sh20c500g.shを作成し、実行した。

10c200g.shの方はダメだった。ちなみにヒトゲノムのデータセットが悪い説はないか?

1220

オジロのソフトマスクやり直し

DfamのデータベースとRepeatModelerで作成したデータベースを結合させる。

kosukesano@at137:~/tools/for_RepeatMasker_Docker/nama_data$ cat Dfam-RepeatMasker.lib ~/tools/for_softmask/Ojiro_softmask/RM_3181478.TueOct151949192024/consensi.fa.classified > 241220_for_ojiro.lib
kosukesano@at137:~/tools/for_RepeatMasker_Docker/nama_data$ ls
231117_madaragenome.fasta  241128_for_madara.lib  241220_for_ojiro.lib  Dfam-RepeatMasker.lib
kosukesano@at137:~/tools/for_RepeatMasker_Docker/nama_data$

~/tools/for_RepeatMasker_Docker/241220_ojiroを作成し、その下でojiro_softmask.shを実行する。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
        RepeatMasker\
        -pa 6\
        -s\
        -lib /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/241220_for_ojiro.lib\
        -dir /home/kosukesano/tools/for_RepeatMasker_Docker/241220_ojiro/output_dir\
        -xsmall\
        -gff\
        /home/kosukesano/tools/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa


echo end at
date

1226

オジロのソフトマスク結果

>ptg000001l
TAGCAGTATCGAGTATAATCATAATATCGTAGTTTTATTGCTAAAACTGT
CCTTTCAACTAATAGTTAGGTATAGATATTCACATATGCATTTTCATTTT
TAAATAAATCTTCGATACTCTGTAATCAATTTCCATTTTTGTTCTATCCC
AAATTATATAAAGTATATAATTTTCTATGTTTTTTTGGTGGAGTGTTCGC
AAAGGGCTGTGACTTGAAGGATGCGTCTTAATCTCGAGGAATATAATGAA
GCAAATGTATCTGCATTAATCTTCTTCTATCTAGTGAGTTGAAATATAAT
GTGGGGTATTATAACAATGACGCAGTAGTAAATAAAAATAAATCAAATCG
ACTTACGTCGATATAAAGTATACTAATTAAAAACATAAAGTCAATCTCGC
AAAAGCAAATATAAGTTAATACATATTAGATATAAATTTGTCCAGATATA
TTAAAATGGCTATTAGTCATTTCTTGACACGGGAtaattaataattaatt
tttcattaaattaaCATACTAAGAAAAACCAGACATCAGACCCAGTTGGT
TTTTCAACTGAAGTGAAACAGTAATCTTAAGCAAATATATCAATAATCTA
ATATGAATTCCTACAAAATTATCTGCTTGAACCTAGAACAAGCTATGCCT
GCGTATATAACTTTAACCAGTTAAGTGACTTCATGCATATATTACTATGA
TTTTAACACCTAATTAGCCTAATGGCTTCTGCTTATGTTCAAAAGATTAC
ATCTAAGTCGATTTTCTTCTCATCGTCATAAGAGGATTAAAATATTCAAA
TTAATAATATCCAGAATGATCAATAAATTAACAAACGAAATTTTAAATTG
CCGTTGATCTAATgtggtaaatgggtattatgtaatatttttcgacaggg
gtggtatgatcgagtaattcgTCAACTAGAAACTACAGTATATATTGTAT
CTGAGCTGAACGAAGTTacagggatatccatataaaagtaatgaatccta
ctttttattcttaaataaacgttatatataaaagttttggctattttgaa
acatttatatatcttacaaccaaaataattgtgcaacgaataatgaagta
gtataaaacatcgattttcttgcttacttaaattggacggtatggttttt

できてそう

これをBRAKER3用の生データディレクトリにコピー。

kosukesano@at138:~/tools/for_braker/nama_data$ cp ~/tools/for_RepeatMasker_Docker/241220_ojiro/output_dir/out.p_ctg.fa.masked 241226_ojiro_masked.fa
kosukesano@at138:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta                   Dfro.fna                                            Pst_NotUseEDTA_upper10000.fna  femo_busco.sh.po26221930
241017_Ojiro_masked.fa                           Dval.fna                                            Pst_NotUseEDTA_upper5000.fna   kohuki_busco.sh
241120_madara_dfam.fasta                         Ekam_NotUseEDTA.fna                                 Sfem_RNAseq                    kohuki_busco.sh.e26238968
241127_madara_DockerRM.fasta                     Ekam_oomoji.fna                                     Sfem_pilon_softmasked.fasta    kohuki_busco.sh.o26238968
241127_madara_dfam_RM_data_NotUsedBuildDB.fasta  Elaeidobius_kamerunicus.masked.fna                  Sfem_softmasked.fasta          kohuki_busco.sh.pe26238968
241127_madara_dfam_RMdata_buildDB.fasta          GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  busco_downloads                kohuki_busco.sh.po26238968
241129_madara_dfamplusbuilddb.fasta              Madara_RNAseq                                       femo_busco.sh                  kohuki_softmasked.fasta
241226_ojiro_masked.fa                           Ojiro_RNAseq                                        femo_busco.sh.e26221930        kohuki_softmasked_upper1000.fasta
BUSCO_OUTPUT_FEMO_GENOME                         Pst_NotUseEDTA.fna                                  femo_busco.sh.o26221930        length.txt
BUSCO_OUTPUT_KOHUKI_GENOME                       Pst_NotUseEDTA_upper1000.fna                        femo_busco.sh.pe26221930       madaralength.txt
kosukesano@at138:~/tools/for_braker/nama_data$ 

オジロのBRAKER3再実行

~/tools/for_braker/241226_ojiroを作成し、そこで以下のスクリプトを作成。

### ojiro_braker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241226_ojiro_masked.fa\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=ojiro-female_1,ojiro-female_2,ojiro-male_1,ojiro-male_2,ojiro-larva_1,ojiro-larva_2,\
        ojiro_E1_1,ojiro_E1_2,ojiro_E2_1,ojiro_E2_2,ojiro_E3_1,ojiro_E3_2,ojiro_E4_1,ojiro_E4_2,\
        ojiro_H1_1,ojiro_H1_2,ojiro_H2_1,ojiro_H2_2,ojiro_H3_1,ojiro_H3_2,ojiro_H4_1,ojiro_H4_2,\
        ojiro_L1_1,ojiro_L1_2,ojiro_L2_1,ojiro_L2_2,ojiro_L3_1,ojiro_L3_2,ojiro_L4_1,ojiro_L4_2,\
        ojiro_O1_1,ojiro_O1_2,ojiro_O2_1,ojiro_O2_2,ojiro_O3_1,ojiro_O3_2,ojiro_O4_1,ojiro_O4_2,\
        ojiro_T1_1,ojiro_T1_2,ojiro_T2_1,ojiro_T2_2,ojiro_T3_1,ojiro_T3_2,ojiro_T4_1,ojiro_T4_2\
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq\
        --threads=16\
        --species=Ojiro_241226\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

遺伝研でのGeMoMa実行

~/tools/for_gemoma/241226ディレクトリを作成、以下でTcasのみをレファレンスとしたgemoma_tcas.shを作成した。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 10
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

# 環境の読み込み
source /home/kosukesano/tools/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx100g \
        GeMoMaPipeline \
        t=/home/kosukesano//tools/for_braker/nama_data/231117_Madara_softmasked.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/kosukesano/tools/for_gemoma/241226/tcas_out

echo end at
date

TcasとDmelをレファレンスにしたスクリプトgemoma_dmel_tcas.shも併せて実行した。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 10
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date

# 環境の読み込み
source /home/kosukesano/tools/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx100g \
        GeMoMaPipeline \
        t=/home/kosukesano//tools/for_braker/nama_data/231117_Madara_softmasked.fasta \
        r=NO \
        o=true \
        s=own \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        s=own \
        i=Dmel \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Dmel/genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Dmel/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/kosukesano/tools/for_gemoma/241226/dmel_tcas_out

echo end at
date

2025年1月

0107

GeMoMa続き

遺伝研のジョブがqwのまま進まない。

とりあえずintelでも入れてみる。


#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 10
#$ -l s_vmem=64G
#$ -l mem_req=64G

echo start at
date

# 環境の読み込み
source /home/kosukesano/tools/pyenv_env/gemoma_profile

# GeMoMaPipelineの実行
GeMoMa \
        -Xmx64g \
        GeMoMaPipeline \
        t=/home/kosukesano//tools/for_braker/nama_data/231117_Madara_softmasked.fasta \
        r=NO \
        o=true \
        i=Tcas \
        a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
        g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
        GeMoMa.Score=ReAlign \
        AnnotationFinalizer.r=NO \
        threads=10 \
        outdir=/home/kosukesano/tools/for_gemoma/250107/tcas_out

echo end at
date

SSHの設定変更

(base) :~/Desktop/notebook$ ssh-keygen -R gw.ddbj.nig.ac.jp
# Host gw.ddbj.nig.ac.jp found: line 1
# Host gw.ddbj.nig.ac.jp found: line 5
/Users/kosukesano/.ssh/known_hosts updated.
Original contents retained as /Users/kosukesano/.ssh/known_hosts.old
(base) :~/Desktop/notebook$ 

0110

GINGERのインストール

Dockerイメージを使って入れた。Docker pullを使うとエラー吐かれるので注意

### 失敗例
kosukesano@at139:~/tools/for_ginger$ docker pull i10labtitech/tools:GINGER_v1.0.1
Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?
kosukesano@at139:~/tools/for_ginger$

Dockerdaemonが悪さをするらしい。

### 多分うまくいったやつ?
kosukesano@at139:~/tools/for_ginger$ apptainer pull docker://i10labtitech/tools:GINGER_v1.0.1
INFO:    Converting OCI blobs to SIF format
INFO:    Starting build...
Getting image source signatures
Copying blob 482bd95e477d done   | 
Copying blob 1bc677758ad7 done   | 
Copying config 6590a7c3cd done   | 
Writing manifest to image destination
2025/01/10 17:22:58  info unpack layer: sha256:1bc677758ad7fa4503417ae5be18809c5a8679b5b36fcd1464d5a8e41cb13305
2025/01/10 17:22:59  info unpack layer: sha256:482bd95e477d05637df1423052f7034a88d402126f6ec0a2ae7a6165e9891dab
2025/01/10 17:25:01  warn rootless{usr/local/src/trinityrnaseq-v2.15.0/trinity-plugins/Trimmomatic-0.36/trimmomatic.jar} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02  warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/Trimmomatic-0.36/trimmomatic.jar} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02  warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/build/lib/libhts.so} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02  warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/build/lib/libhts.so.3} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02  warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/htscodecs.mk} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02  warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/libhts.so.3} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
INFO:    Creating SIF file...
kosukesano@at139:~/tools/for_ginger$

0121

コフキゾウムシのソフトマスク

コフキゾウムシゲノムの生データがこれ

kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta
file                                                          format  type   num_seqs        sum_len  min_len  avg_len  max_len  Q1   Q2   Q3  sum_gap     N50  Q20(%)  Q30(%)  GC(%)
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta  FASTA   DNA   2,372,896  3,664,337,660       48  1,544.2  151,585  86  100  363        0  15,058       0       0  32.29
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ 

ゲノム全長が3Gbpで、短い配列がめちゃくちゃ多い。

1000bp未満を切り落とす

kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 1000 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta > 250121_kohuki_upper1000.fasta
[WARN] you may switch on flag -g/--remove-gaps to remove spaces
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a 250121_kohuki_upper1000.fasta 
file                           format  type  num_seqs        sum_len  min_len  avg_len  max_len     Q1     Q2      Q3  sum_gap     N50  Q20(%)  Q30(%)  GC(%)
250121_kohuki_upper1000.fasta  FASTA   DNA    397,892  3,349,012,532    1,000  8,416.9  151,585  1,967  4,187  10,292        0  17,050       0       0  32.27
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$

めちゃくちゃコンティグが減った。これを使ってソフトマスクを行う。

~/tools/for_softmask/250121_Kohuki_softmaskディレクトリを作成、そこでまずBLASTデータベースを作る。

(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Kohuki_softmask$ BuildDatabase -name Kohuki_denovo_DB ../kohuki_softmask/250121_kohuki_upper1000.fasta
Building database Kohuki_denovo_DB:
  Reading ../kohuki_softmask/250121_kohuki_upper1000.fasta...
Number of sequences (bp) added to database: 397892 ( 3349012532 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Kohuki_softmask$ ls
Kohuki_RepeatModeler.sh  Kohuki_denovo_DB.nin  Kohuki_denovo_DB.nnd  Kohuki_denovo_DB.nog  Kohuki_denovo_DB.translation
Kohuki_denovo_DB.nhr     Kohuki_denovo_DB.njs  Kohuki_denovo_DB.nni  Kohuki_denovo_DB.nsq
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Kohuki_softmask$ 

続いてこれを元にRepeatModelerを実行する

### Kohuki_RepeatModeler.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile

RepeatModeler -database Kohuki_denovo_DB -pa 6

echo end at
date

これをqsubで投げた。

フェモラータのソフトマスク

フェモラータゲノムの生データがこれ

(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Sfem_assembly.fasta
file                 format  type  num_seqs      sum_len  min_len   avg_len    max_len       Q1     Q2     Q3  sum_gap        N50  Q20(%)  Q30(%)  GC(%)
Sfem_assembly.fasta  FASTA   DNA      5,084  495,627,753       26  97,487.8  7,760,786  1,769.5  3,258  6,975        0  1,228,127       0       0  36.71
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ 

同じく1000bp未満を切る

(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 1000 Sfem_assembly.fasta > 250121_Sfem_upper1000.fasta
[WARN] you may switch on flag -g/--remove-gaps to remove spaces
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a 250121_Sfem_upper1000.fasta 
file                         format  type  num_seqs      sum_len  min_len    avg_len    max_len     Q1     Q2      Q3  sum_gap        N50  Q20(%)  Q30(%)  GC(%)
250121_Sfem_upper1000.fasta  FASTA   DNA      4,530  495,258,564    1,001  109,328.6  7,760,786  2,211  3,625  12,603        0  1,228,127       0       0  36.71
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$

そんなに変わらなかった。

~/tools/for_softmask/250121_Sfem_softmaskディレクトリを作成、そこでBLASTデータベースを作る。

(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ BuildDatabase -name Sfem_denovo_DB ../nama_data/250121_Sfem_upper1000.fasta 
Building database Sfem_denovo_DB:
  Reading ../nama_data/250121_Sfem_upper1000.fasta...
Number of sequences (bp) added to database: 4530 ( 495258564 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ ls
Sfem_denovo_DB.nhr  Sfem_denovo_DB.nin  Sfem_denovo_DB.njs  Sfem_denovo_DB.nnd  Sfem_denovo_DB.nni  Sfem_denovo_DB.nog  Sfem_denovo_DB.nsq  Sfem_denovo_DB.translation
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ 

続いてこれを元にRepeatModelerを実行する

### Sfem_RepeatModeler.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date

source ~/tools/pyenv_env/EDTA_profile
;;pppppl/////
RepeatModeler -database Sfem_denovo_DB -pa 6

echo end at
date

これをqsubで投げた。

0122

2種のソフトマスク

なんか全然入らなかったので、short設定にしてメモリ設定を全部3で割った値に直した。

0123

2種のソフトマスク

round-1で止まっちゃったので、gpuにしてメモリを戻してもう一回かけた。

GINGERのインストール(GitHubから直接落とす)

git cloneを使用してGitHubページから必要なファイル・ディレクトリを全て落とす。

kosukesano@at137:~/tools/for_ginger/250123_test$ git clone https://github.com/i10labtitech/GINGER.git
Cloning into 'GINGER'...
remote: Enumerating objects: 428, done.
remote: Counting objects: 100% (428/428), done.
remote: Compressing objects: 100% (237/237), done.
remote: Total 428 (delta 247), reused 366 (delta 185), pack-reused 0 (from 0)
Receiving objects: 100% (428/428), 1.10 MiB | 1.28 MiB/s, done.
Resolving deltas: 100% (247/247), done.
kosukesano@at137:~/tools/for_ginger/250123_test$

こんなディレクトリができる。

kosukesano@at137:~/tools/for_ginger/250123_test$ ls
GINGER
kosukesano@at137:~/tools/for_ginger/250123_test$ ls GINGER/
AUTHORS  CHANGES  ChangeLog  FAQ  INSTALL  LICENSE  Makefile  README  VERSION  generateSampleData_cel.pl  nextflow.config.user  pipeline  runEvaluatePred.pl  runGINGER.pl  src  util
kosukesano@at137:~/tools/for_ginger/250123_test$ 

GINGERディレクトリに入ってmakeコマンドを実行

kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$ make
cd src/mapping && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/mapping'
g++ gff_trimmer.cpp -o gff_trimmer -std=c++0x -O3
g++ exon_num_filter.cpp -o exon_num_filter -std=c++0x -O3
g++ longest_transcript.cpp -o longest_transcript -std=c++0x -O3
g++ repeat_checker.cpp -o repeat_checker -std=c++0x -O3
g++ strand_replace.cpp -o strand_replace -std=c++0x -O3
g++ set_difference.cpp -o set_difference -std=c++0x -O3
g++ tag_trimmer.cpp -o tag_trimmer -std=c++0x -O3
g++ ORF_finder.cpp -o ORF_finder -std=c++0x -O3
install -s \
gff_trimmer exon_num_filter longest_transcript \
repeat_checker strand_replace set_difference \
tag_trimmer ORF_finder ../../util/mapping; \
install -s ORF_finder ../../util/denovo
install: target '../../util/mapping' is not a directory
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/mapping'
cd src/denovo && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/denovo'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/denovo'
cd src/homology && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/homology'
g++ fastarepair.cpp -o fastarepair -std=c++0x -O3
g++ fastarepair2.cpp -o fastarepair2 -std=c++0x -O3
g++ gff_2_proteinfasta.cpp -o gff_2_proteinfasta -std=c++0x -O3
g++ flameshiftgrep.cpp -o flameshiftfilter -std=c++0x -O3
install -s fastarepair fastarepair2 gff_2_proteinfasta flameshiftfilter ../../util/homology
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/homology'
cd src/abinitio && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/abinitio'
g++ simple_low_norepeatmask.cpp -o simple_low_norepeatmask -std=c++0x -O3
g++ inframe_stopcodon_exclude.cpp -o inframe_stopcodon_exclude -std=c++0x -O3
g++ makefasta.cpp -o makefasta -std=c++0x -O3
install -s simple_low_norepeatmask inframe_stopcodon_exclude makefasta ../../util/abinitio
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/abinitio'
cd src/merge_phase0 && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase0'
g++ 190521_gff_editor.cpp -o gff_editor -std=c++0x -O3
In file included from 190521_gff_editor.cpp:33:
function.hpp: In function ‘int filter_fuc(std::string, std::string, std::unordered_map<std::__cxx11::basic_string<char>, std::__cxx11::basic_string<char> >&, int&, int&)’:
function.hpp:272:1: warning: control reaches end of non-void function [-Wreturn-type]
  272 | }
      | ^
g++ row2_rename.cpp -o Row2_rename -std=c++0x -O3
g++ rnaseq_reform.cpp -o RNA-seq_reform -std=c++0x -O3
g++ spaln_reform.cpp -o Spaln_reform -std=c++0x -O3
g++ augustus_reform.cpp -o Augustus_reform -std=c++0x -O3
install -s \
gff_editor Row2_rename RNA-seq_reform Spaln_reform \
Augustus_reform ../../util/merge_phase0
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase0'
cd src/merge_phase1 && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase1'
g++ grouping.cpp -o Grouping -std=c++0x -O3
g++ subgroup_v2.2.cpp -o subgroup -std=c++0x -O3
g++ new_subgroup.cpp -o new_subgroup -std=c++0x -O3
g++ searchalgo.cpp -o Searchalgo -std=c++0x -O3
g++ gff_editor.cpp -o gff_editor -std=c++0x -O3
g++ initial_exon_polish.cpp -o initial_exon_polish -std=c++0x -O3
install -s \
Grouping subgroup new_subgroup Searchalgo \
gff_editor initial_exon_polish ../../util/merge_phase1
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase1'
cd src/merge_phase2 && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase2'
g++ geneadd_v191115.cpp -o geneadd_v191115 -std=c++0x -O3
g++ geneadd_v191119.cpp -o geneadd_v191119 -std=c++0x -O3
g++ grouping_v1.cpp -o grouping_v1 -std=c++0x -O3
install -s geneadd_v191115 geneadd_v191119 grouping_v1 ../../util/merge_phase2
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase2'
cd src/summary && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/summary'
g++ final_reform.cpp -o final_reform -std=c++0x -O3
install -s final_reform ../../util/summary
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/summary'
cd src/evaluation && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/evaluation'
g++ evaluation4.cpp -o evaluate -std=c++0x -O3
g++ preevaluation.cpp -o preevaluate -std=c++0x -O3
install -s evaluate preevaluate ../../util/evaluation
install: target '../../util/evaluation' is not a directory
make[1]: *** [Makefile:12: install] Error 1
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/evaluation'
make: *** [Makefile:31: evaluation] Error 2
kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$

なんか知らないエラーが出てるけど気にしない。

この状態でもう既に素のGINGERは動く。

kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$ ./runGINGER.pl --help

Usage: ./runGINGER.pl [Netflow configuration file] 

  --mapping        Preparation phase Mapping-based method only
  --denovo         Preparation phase de novo-based method only
  --homology       Preparation phase homology-based method only
  --abinitio       Preparation phase ab initio-based method only
  --phase0         Merge phase 0 only
  --phase1         Merge phase 1 only
  --phase1manual   Merge phase 1 only, need the threshold of gene 
  --phase2         Merge phase 2 only
  --totalcds       Total CDS minimum length in Merge phase 2 (i.e. threshold)
  --summary        Merge phase summary only
  --help           This help message

./runGINGER.pl [configuration file for user specific settings] at ./runGINGER.pl line 83.
kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$ 

GINGERの実行(できなかった)

kosukesano@at137:~/tools/for_ginger/250123_test/nama_data$ ls
231117_Madara_softmasked.fasta  231117_madaragenome.fasta  231117_madaragenome.fasta.out  adult-1_1.fastq  adult-1_2.fastq
kosukesano@at137:~/tools/for_ginger/250123_test/nama_data$ 

ここに生データを置いた。

~/tools/for_ginger/250123_testnextflow.configをコピーし、編集して実行しようとしたが……。

/****************************************
     Homology based
     ****************************************/

    PDIR_PREP_HOMOLOGY                = "${PDIR_PREP}/homology" // *** No need to edit ***
    PDIR_PREP_HOMOLOGY_HOMOLOGY       = "${PDIR_PREP_HOMOLOGY}/homology" // *** No need to edit ***
    PDIR_PREP_HOMOLOGY_HOMOLOGYMERGE  = "${PDIR_PREP_HOMOLOGY}/homology_merge" // *** No need to edit ***
    PDIR_PREP_HOMOLOGY_HOMOLOGYFILTER = "${PDIR_PREP_HOMOLOGY}/homology_filter" // *** No need to edit ***
    UTILPATH_HOMOLOGY                 = "${GINGER_UTIL}/homology" // *** No need to edit ***

    // --- Tools for homology ---
    SPALN             = "/path/to/spaln" // a full path to Spaln command "spaln"
    MAKEIDX           = "/path/to/makeidx.pl" // a full path to Spaln command "makeidx.pl"
    MAKBLK            = "/path/to/makblk.pl" // a full path to Spaln command "makblk.pl"        

    /****************************************
     Ab initio based
     ****************************************/

    PDIR_PREP_ABINITIO          = "${PDIR_PREP}/abinitio" // *** No need to edit ***
    PDIR_PREP_ABINITIO_AUGUSTUS = "${PDIR_PREP_ABINITIO}/augustus" // *** No need to edit ***
    PDIR_PREP_ABINITIO_SNAP     = "${PDIR_PREP_ABINITIO}/snap" // *** No need to edit ***
    UTILPATH_ABINITIO           = "${GINGER_UTIL}/abinitio" // *** No need to edit ***

    // --- Tools and options related to Augustus ---
    AUGUSTUS_DIR           = "/path/to/augustusSourceTree" // a full path to a directory that Augustus source tree exists
    AUGUSTUS               = "${AUGUSTUS_DIR}/bin/augustus" // *** No need to edit ***
    ETRAINING              = "${AUGUSTUS_DIR}/bin/etraining" // *** No need to edit ***
    AUGUSTUS_SCRIPT_DIR    = "${AUGUSTUS_DIR}/scripts" // *** No need to edit ***
    AUGUSTUS_CONFIG_DIR    = "${AUGUSTUS_DIR}/config" // *** No need to edit ***
    AUGUSTUS_SPEC_DIR      = "${AUGUSTUS_DIR}/config/species" // *** No need to edit ***
    AUGUSTUS_WORK_DIR      = "${PDIR}/augustus_config" // *** No need to edit ***
    AUGUSTUS_SPEC          = "ginger" // a directory name that stores a new trained model
                                      // the name must be unique within "[Augutus root]/config/species/"
    AUGUSTUS_TRAINING_DATA = "${PDIR_PREP_MAPPING_TOLEARN2ND}/${OPREFIX}_learn_2nd.gff3" // *** No need to edit ***
                                      // AUGUSTUS_TRAINING_DATA is used if you run mapping.nf and abinitio.nf separately
    AUGUSTUS_TRAINING_SIZE = 1000  // Number of gene structures for training
    
    // --- Tools and options related to SNAP ---
    SNAP_DIR                = "/path/to/snapBinDir" // a full path to a directory that SNAP binary exists
    FATHOM                  = "${SNAP_DIR}/fathom" // *** No need to edit ***
    FORGE                   = "${SNAP_DIR}/forge" // *** No need to edit ***
    SNAP                    = "${SNAP_DIR}/snap" // *** No need to edit ***

依存パッケージ全部にパス通すの!?キッツイ!

DfamRepeatMasker用データとマダラのゲノムデータをEDTABuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ」を使って再度OrthoFinderをかける

1202に最新バージョンのマダラゲノムからアイソフォームを抜いてたようなので、これを使ってOrthoFinderを行う。

kosukesano@at137:~/tools/for_braker/241129_madara$ ls
241129_madara_iso1.aa  busco_1153799319.log  busco_4088551519.log  madara_braker.sh.e27304363   madara_busco.sh.e27312420  madara_busco.sh.o27312452   madara_busco.sh.po27312451
BUSCO_OUTPUT_MADARA    busco_1949637205.log  busco_984679413.log   madara_braker.sh.o27304363   madara_busco.sh.e27312451  madara_busco.sh.pe27312420  madara_busco.sh.po27312452
BUSCO_OUTPUT_MADARA2   busco_2089185273.log  busco_downloads       madara_braker.sh.pe27304363  madara_busco.sh.e27312452  madara_busco.sh.pe27312451
ExIsoform.py           busco_269980639.log   iso1_busco            madara_braker.sh.po27304363  madara_busco.sh.o27312420  madara_busco.sh.pe27312452
braker                 busco_3215571167.log  madara_braker.sh      madara_busco.sh              madara_busco.sh.o27312451  madara_busco.sh.po27312420
kosukesano@at137:~/tools/for_braker/241129_madara$

241129_madara_iso1.aaがアイソフォーム抜いたやつ

続いて、OrthoFinder用のディレクトリを作成し、マダラのゲノムをコピーする

kosukesano@at137:~/tools/for_orthofinder$ mkdir 250123_6sp_iso1
kosukesano@at137:~/tools/for_orthofinder$ cd 250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ~/tools/for_braker/241129_madara/241129_madara_iso1.aa Smad_iso1.faa
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ ls
Smad_iso1.faa
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ 

続いて、マダラゲノムデータのヘッダー行を書き換える。edit.pyを使用

kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ python edit.py 
../250123_6sp_iso1/Smad_iso1.faa に保存しました。
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ ls 
Smad_iso1.faa  edit.py
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ 

また、昔作ったアイソフォーム1つのファイルが241115_6sp_isoにあるので、全部コピーする。

kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Cass_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Sory_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Dpon_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Agra_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Tcas_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ ls
Agra_iso1.faa  Cass_iso1.faa  Dpon_iso1.faa  Smad_iso1.faa  Sory_iso1.faa  Tcas_iso1.faa  edit.py
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$

これらを使ってOrthoFinderを実行。以下のスクリプトをqsubで投げた。

### orthofinder_250123.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l gpu

echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
        -f /home/kosukesano/tools/for_orthofinder/250123_6sp_iso1\
        -t 16

echo end at
date

今後はこれを使ってCAFEなりPAMLなりを行なっていく。

0124

OrthoFinder結果

kosukesano@at138:~$ cd tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1/
kosukesano@at138:~/tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1$ ls
Citation.txt                     Gene_Trees            Orthogroups                            Phylogenetically_Misplaced_Genes  Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics  Log.txt               Orthologues                            Putative_Xenologs                 Species_Tree
Gene_Duplication_Events          Orthogroup_Sequences  Phylogenetic_Hierarchical_Orthogroups  Resolved_Gene_Trees               WorkingDirectory
kosukesano@at138:~/tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1$

OK、できてる

フェモラータのソフトマスク続き

RepeatModelerが終わった。

kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ ls RM_1746481.ThuJan231122422025/
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$

最終出力のconsensi.fa.classifiedもできてる!

これをDfamRepeatMasker用データセットとマージする。

kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ cat ../nama_data/Dfam_RepeatMasker_lib.fasta RM_1746481.ThuJan231122422025/consensi.fa.classified > Sfem_merged.fasta
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ ls
RM_1746481.ThuJan231122422025    Sfem_RepeatModeler.sh.e27434697   Sfem_RepeatModeler.sh.pe27434697  Sfem_denovo_DB-families.stk  Sfem_denovo_DB.nnd  Sfem_denovo_DB.translation
RM_2307827.WedJan221537242025    Sfem_RepeatModeler.sh.o27433478   Sfem_RepeatModeler.sh.po27433478  Sfem_denovo_DB.nhr           Sfem_denovo_DB.nni  Sfem_merged.fasta
Sfem_RepeatModeler.sh            Sfem_RepeatModeler.sh.o27434697   Sfem_RepeatModeler.sh.po27434697  Sfem_denovo_DB.nin           Sfem_denovo_DB.nog
Sfem_RepeatModeler.sh.e27433478  Sfem_RepeatModeler.sh.pe27433478  Sfem_denovo_DB-families.fa        Sfem_denovo_DB.njs           Sfem_denovo_DB.nsq
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ 

これを使ってRepeatMaskerをかける。

### Sfem_RepeatMasker.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
        RepeatMasker\
        -pa 6\
        -s\
        -lib /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/Sfem_merged.fasta\
        -dir /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir\
        -xsmall\
        -gff\
        /home/kosukesano/tools/for_softmask/nama_data/250121_Sfem_upper1000.fasta


echo end at
date

qsubで実行した。

GINGERの実行

tools_GINGER_v1.0.1.sifapptainerで落としてきたファイルを起動してapptainerの対話コンソールに入る。
次に、git cloneで持ってきたファイル群GINGER/にPATHを通す。
ginger実行

kosukesano@at138:~/tools/for_ginger$ ls
250123_test  generateSampleData_cel.pl  sample  tools_GINGER_v1.0.1.sif  util  workspace
kosukesano@at138:~/tools/for_ginger$ ls 250123_test/
GINGER  nama_data  nextflow.config
kosukesano@at138:~/tools/for_ginger$ ls 250123_test/GINGER/
AUTHORS  CHANGES  ChangeLog  FAQ  INSTALL  LICENSE  Makefile  README  VERSION  generateSampleData_cel.pl  nextflow.config.user  pipeline  runEvaluatePred.pl  runGINGER.pl  src  util
kosukesano@at138:~/tools/for_ginger$ apptainer shell tools_GINGER_v1.0.1.sif
Apptainer> export PATH=$PATH:/home/kosukesano/tools/for_ginger/250123_test/GINGER
Apptainer> runGINGER.pl nextflow.config.user
No configuration file for user specific settings.
(/home/kosukesano/tools/for_ginger/nextflow.config.user) at /home/kosukesano/tools/for_ginger/250123_test/GINGER/runGINGER.pl line 88.
Apptainer> exit
exit
kosukesano@at138:~/tools/for_ginger$ less 250123_test/GINGER/nextflow.config.user 
kosukesano@at138:~/tools/for_ginger$

これでnextflow.config.userをちゃんと整えればワンチャンいけるかもしれへん

0127

コフキゾウムシのソフトマスク続き

kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ ls RM_106086.ThuJan231122322025/
consensi.fa  consensi.fa.classified  families-classified.stk  families.stk  round-1  round-2  round-3  round-4  round-5  round-6  tmpConsensi.fa
kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ 

できてそう。

生データを移動

kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ cp ../kohuki_softmask/250121_kohuki_upper1000.fasta ../nama_data/
kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ 

RepeatMaskerを実行。

### Kohuki_RepeatMasker_250127.sh


#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G

echo start at
date

apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
        RepeatMasker\
        -pa 6\
        -s\
        -lib /home/kosukesano/tools/for_softmask/250121_Kohuki_softmask/Kohuki_merged.fasta\
        -dir /home/kosukesano/tools/for_softmask/250121_Kohuki_softmask/output_dir\
        -xsmall\
        -gff\
        /home/kosukesano/tools/for_softmask/nama_data/250121_kohuki_upper1000.fasta


echo end at
date

これをqsubで投げた

フェモラータのRepeatMasker結果

kosukesano@at139:~/tools/for_softmask/250121_Sfem_softmask/output_dir$ ls
250121_Sfem_upper1000.fasta.cat.gz  250121_Sfem_upper1000.fasta.masked  250121_Sfem_upper1000.fasta.out  250121_Sfem_upper1000.fasta.out.gff  250121_Sfem_upper1000.fasta.tbl
kosukesano@at139:~/tools/for_softmask/250121_Sfem_softmask/output_dir$

できてそう。これを一応BUSCOかけておく。

フェモラータのソフトマスク後ゲノムでのBUSCO

### Sfem_G_BUSCO_250127.sh

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
echo start at
date


date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m geno\
        -i /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir/250121_Sfem_upper1000.fasta.masked\
        -o /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir/BUSCO_output_Sfem_genome\
        -l\
        /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

echo end at
date

メモリ少ないと途中で解析止まっちゃうよ!

結果はこう

### ~/tools/for_softmask/250121_Sfem_softmask/output_dirBUSCO_output_Sfem_genome/run_arthropoda_odb10/short_summary.txt


# BUSCO version is: 5.1.3 
# The lineage dataset is: arthropoda_odb10 (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir/250121_Sfem_upper1000.fasta.masked
# BUSCO was run in mode: genome
# Gene predictor used: metaeuk

        ***** Results: *****

        C:98.0%[S:97.1%,D:0.9%],F:0.6%,M:1.4%,n:1013       
        993     Complete BUSCOs (C)                        
        984     Complete and single-copy BUSCOs (S)        
        9       Complete and duplicated BUSCOs (D)         
        6       Fragmented BUSCOs (F)                      
        14      Missing BUSCOs (M)                         
        1013    Total BUSCO groups searched                

Dependencies and versions:
        hmmsearch: 3.1
        metaeuk: 4.a0f584d
kosukesano@at137:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat 250121_Sfem_upper1000.fasta 
file                         format  type  num_seqs      sum_len  min_len    avg_len    max_len
250121_Sfem_upper1000.fasta  FASTA   DNA      4,530  495,258,564    1,001  109,328.6  7,760,786
kosukesano@at137:~/tools/for_softmask/nama_data$

フェモラータのBRAKER3

kosukesano@at137:~/tools/for_braker/nama_data$ mv ~/tools/for_softmask/250121_Sfem_softmask/output_dir/250121_Sfem_upper1000.fasta.masked 250127_Sfem_upper1000_masked.fasta
kosukesano@at137:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta                   Dfro.fna                                            Pst_NotUseEDTA_upper5000.fna  kohuki_busco.sh.e26238968
241017_Ojiro_masked.fa                           Dval.fna                                            Sfem_RNAseq                   kohuki_busco.sh.o26238968
241120_madara_dfam.fasta                         Ekam_NotUseEDTA.fna                                 Sfem_pilon_softmasked.fasta   kohuki_busco.sh.pe26238968
241127_madara_DockerRM.fasta                     Ekam_oomoji.fna                                     Sfem_softmasked.fasta         kohuki_busco.sh.po26238968
241127_madara_dfam_RM_data_NotUsedBuildDB.fasta  Elaeidobius_kamerunicus.masked.fna                  busco_downloads               kohuki_softmasked.fasta
241127_madara_dfam_RMdata_buildDB.fasta          GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz  femo_busco.sh                 kohuki_softmasked_upper1000.fasta
241129_madara_dfamplusbuilddb.fasta              Madara_RNAseq                                       femo_busco.sh.e26221930       length.txt
241226_ojiro_masked.fa                           Ojiro_RNAseq                                        femo_busco.sh.o26221930       madaralength.txt
250127_Sfem_upper1000_masked.fasta               Pst_NotUseEDTA.fna                                  femo_busco.sh.pe26221930
BUSCO_OUTPUT_FEMO_GENOME                         Pst_NotUseEDTA_upper1000.fna                        femo_busco.sh.po26221930
BUSCO_OUTPUT_KOHUKI_GENOME                       Pst_NotUseEDTA_upper10000.fna                       kohuki_busco.sh
kosukesano@at137:~/tools/for_braker/nama_data$ 
### Sfem_braker_250127.sh 

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/250127_Sfem_upper1000_masked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --rnaseq_sets_ids=Sfem-1_1,femo-larva_1,femo_H1_1,femo_H3_1,femo_L1_1,femo_L3_1,femo_O1_1,femo_O3_1,femo_T1_1,femo_T3_1,Sfem-1_2,femo-larva_2,femo_H1_2,femo_H3_2,femo_L1_2,femo_L3_2,femo_O1_2,femo_O3_2,femo_T1_2,femo_T3_2,femo-female_1,femo-male_1,femo_H2_1,femo_H4_1,femo_L2_1,femo_L4_1,femo_O2_1,femo_O4_1,femo_T2_1,femo_T4_1,femo-female_2,femo-male_2,femo_H2_2,femo_H4_2,femo_L2_2,femo_L4_2,femo_O2_2,femo_O4_2,femo_T2_2,femo_T4_2 \
        --rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Sfem_RNAseq\
        --threads=16\
        --species=250127_Sfemorata\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

GINGERの実行

~/tools/for_ginger/250123_test250127_nextflow.config.userを作成し、ここで実行しようとしたが以下のエラーが発生。

kosukesano@at137:~/tools/for_ginger/250123_test$ apptainer shell ../tools_GINGER_v1.0.1.sif 
Apptainer> export PATH=$PATH:/home/kosukesano/tools/for_ginger/250123_test/GINGER
Apptainer> runGINGER.pl /home/kosukesano/tools/for_ginger/250123_test/250127_nextflow.config.user 
cp: '/home/kosukesano/tools/for_ginger/250123_test/250127_nextflow.config.user' and '/home/kosukesano/tools/for_ginger/250123_test/250127_nextflow.config.user' are the same file
Apptainer> 

これは遺伝研特有のコピーディレクトリが悪さをしているのでは?

いやいやスクリプトの中身でcpコマンドがあって、こいつが悪さをしているっぽい

0128

Orthofinderのインプットに使ってた種、間違えていないか?

### ~/tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1/Orthogroups/Orthogroups.txtの中身

OG0000000: Agra_P_050299707.1 Cfor_P_060516509.1 Cfor_P_060517001.1 Cfor_P_060518228.1 Cfor_P_060518420.1 Cfor_P_060519135.1 Cfor_P_060519160.1 Cfor_P_060519161.1 Cfor_P_060520792.1 Cfor_P_060527544.1 Cfor_P_060528287.1 Cfor_P_060531558.1 Cfor_P_060531998.1 Cfor_P_060533842.1 Cfor_P_060534539.1 Cfor_P_060534541.1 Cfor_P_060534544.1 Cfor_P_060534546.1 Cfor_P_060535885.1 Cfor_P_060537097.1 Dpon_P_019755307.1 Dpon_P_019762611.2 Dpon_P_019772941.2 Dpon_P_048518566.1 Dpon_P_048519274.1 Dpon_P_048521352.1 Dpon_P_048521747.1 Dpon_P_048523240.1 Smad_g7893.t1 Sory_P_030746543.1 Sory_P_030746551.1 Sory_P_030746552.1 Sory_P_030746582.1 Sory_P_030746655.1 Sory_P_030747073.1 Sory_P_030747074.1 Sory_P_030747075.1 Sory_P_030747801.1 Sory_P_030747802.1 Sory_P_030748222.1 Sory_P_030749813.1 Sory_P_030749814.1 Sory_P_030750813.1 Sory_P_030751507.1 Sory_P_030751509.1 Sory_P_030751544.1 Sory_P_030751614.1 Sory_P_030751739.1 Sory_P_030751799.1 Sory_P_030752348.1 Sory_P_030752696.1 Sory_P_030753012.1 Sory_P_030753074.1 Sory_P_030754214.1 Sory_P_030756675.1 Sory_P_030756676.1 Sory_P_030757554.1 Sory_P_030758072.1 Sory_P_030758322.1 Sory_P_030758536.1 Sory_P_030760004.1 Sory_P_030760368.1 Sory_P_030760810.1 Sory_P_030760811.1 Sory_P_030761017.1 Sory_P_030761914.1 Sory_P_030762002.1 Sory_P_030762174.1 Sory_P_030762175.1 Sory_P_030762745.1 Sory_P_030762746.1 Sory_P_030763129.1 Sory_P_030764236.1 Sory_P_030764263.1 Sory_P_030764314.1 Sory_P_030764610.1 Sory_P_030764983.1 Sory_P_030765172.1 Sory_P_030765480.1 Sory_P_030765760.1 Sory_P_030767016.1 Sory_P_030767532.1 Sory_P_030767905.1

Cfor_P_ってなんだ!?

元は1115のorthofinder、さらにそのインプット元は/home/kosukesano/tools/for_isoform_ex/output_dataだったはず。

### /home/kosukesano/tools/for_isoform_ex/output_data/Cass_iso1.faaの中身

>XP_060524352.1 natterin-3-like [Cylas formicarius]
MAAYYWVDTVARRRVPSTALRGGTDVDGQPIYVGRAFHEGDWIPAKVIPGKQVAYVAYGG
REIPKSQFQVLCEQQFDWVPSRHGHVPPDAVIGGKTSSGENLYIGRVRHRGSHTVGKVHP
SHKCCYIPFDGKEVPHQDYEILVLRG
>XP_060531338.1 uncharacterized protein LOC132704961 [Cylas formicarius]
MRDVAKGTSRQSQRGMSPNSDQSYFERLCPLPYGCACQTTPKGRRTGPCRPRNLDGFLRT
YGFIVTNGSHPVLRTIDAIPKGAGLRLSTLARATKRATRSGKTTRASAKALAGLQ
>XP_060531951.1 uncharacterized protein LOC132705400 isoform X1 [Cylas formicarius]
MYTQTIGWCLFGLLFSGTVLTTLAYPNSQPMPSYRPIRGAPPTLQQVNSVEQMHQERERK
FAEKPNAIKKVALDDLDNVQTNQISESAGGGFSWSNLLGTSYLTPLVNGMLMQMIFNPGG
GVPTGPNKSEGLDDGGVAPSPWANLITMGLKILSAILGGGAAAQNEGIDKVDNGGGSPLQ
GVLAAVVSTMVGGRDPQQVNMLAKQAGEFINIVVNLLDALKTSFSHRSLAARNLGRKDSV
SDAAIAGISMMKGYAKSLGTDESNCMARYMCQANNECSTDIGQSSLFCHLGSYAASFVLD
KATASTTFDLLYEAGRRGRSGDNCQQAYLECNEV

違う種じゃんか〜!!!!

6種のOrthoFinderやり直し

まずディレクトリを作成してCassの生データを移動。

kosukesano@at138:~/tools/for_orthofinder$ mkdir 250128_6sp_iso1
kosukesano@at138:~/tools/for_orthofinder$ cp Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/Cass.fasta 250128_6sp_iso1/
kosukesano@at138:~/tools/for_orthofinder$ cd 250128_6sp_iso1/
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass.fasta
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ 


kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cp genomic.gff ~/tools/for_orthofinder/250128_6sp_iso1/Cass.gff
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cd ~/tools/for_orthofinder/250128_6sp_iso1/
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass.fasta  Cass.gff
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$


(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ mkdir nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass.fasta  Cass.gff  nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ mv Cass.* nama_data/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls nama_data/
Cass.fasta  Cass.gff
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$

faspを使用してアイソフォームを除去

kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ source ~/tools/for_isoform_ex/fasp/bin/activate
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ 
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ python3 -m fasp exclude_isoforms_by_length nama_data/Cass.fasta Cass_iso1.fasta nama_data/Cass.gff
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass_iso1.fasta  nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ 

ヘッダーの書き換え

このedit.pyを実行した。

### ~/tools/for_orthofinder/250128_6sp_iso1/edit.py

import os
from Bio import SeqIO

# 入力ディレクトリと出力ディレクトリのパス
input_dir = '/home/kosukesano/tools/for_orthofinder/250128_6sp_iso1/'
output_dir = '../250128_6sp_iso1/Change_hedder/'

# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# 入力ディレクトリ内のすべての .faa または .aa ファイルを処理
for input_file in os.listdir(input_dir):
    if input_file.endswith(('.faa', '.aa', '.fasta')):
        input_path = os.path.join(input_dir, input_file)
        output_path = os.path.join(output_dir, input_file)

        # ファイル形式を設定
        format_type = 'fasta'  # Biopython では .faa も .aa も "fasta" 形式として扱う

        with open(output_path, 'w') as outfile:
            for record in SeqIO.parse(input_path, format_type):
                header = record.description
                seq = str(record.seq)
                new_header = ""  # 初期化

                # ヘッダーが「g」で始まる場合
                if header.startswith("g"):
                    number = header.split()[0]  # ヘッダーの最初の番号部分を取得
                    new_header = f">Smad_{number}"

                # ヘッダーが「]」で終わる場合
                elif header.endswith("]"):
                    within_brackets = header.split('[')[-1].split(']')[0]
                    first_letter = within_brackets[0]  # 最初の1文字
                    space_after = within_brackets.split()[-1][:3]  # スペース後の3文字
                    first_part = header.split()[0][1:]
                    new_header = f">{first_letter}{space_after}_{first_part}"

                # それ以外
                else:
                    new_header = f">{header.split()[0]}"

                # 新しいヘッダーと配列を出力ファイルに書き込む
                outfile.write(f"{new_header}\n{seq}\n")

        print(f"{output_path} に保存しました。")

インプットを.fastaにも対応させてる。

(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ python edit.py 
../250128_6sp_iso1/Change_hedder/Cass_iso1.fasta に保存しました。
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass_iso1.fasta  Change_hedder  edit.py  nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls Change_hedder/
Cass_iso1.fasta
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$

他のデータをコピーしてくる

(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Agra_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Dpon_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Sory_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Smad_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Tcas_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ 

(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cd Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder$ mv Cass_iso1.fasta Cass_iso1.faa
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder$ ls
Agra_iso1.faa  Cass_iso1.faa  Dpon_iso1.faa  Smad_iso1.faa  Sory_iso1.faa  Tcas_iso1.faa
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder$ 

続いて~/tools/for_orthofinder/250128_6sp_iso1/orthofinder_250128.shを作成、実行した。

### orthofinder_250128.shの中身

#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l gpu

echo start at
date


singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
        -f /home/kosukesano/tools/for_orthofinder/250128_6sp_iso1/Change_hedder\
        -t 16

echo end at
date

できてそう

kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls Change_hedder/OrthoFinder/Results_Jan28/
Citation.txt                     Gene_Trees            Orthogroups                            Phylogenetically_Misplaced_Genes  Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics  Log.txt               Orthologues                            Putative_Xenologs                 Species_Tree
Gene_Duplication_Events          Orthogroup_Sequences  Phylogenetic_Hierarchical_Orthogroups  Resolved_Gene_Trees               WorkingDirectory
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ 

ローカル環境でのGINGERインストール

遺伝研での実行を諦め、ローカルでDockerを使うことにした。

まずDocker Desktopをインストールしておく。 その後、以下のコマンドを実行。

(base) :~/bio/for_ginger$ docker pull i10labtitech/tools:GINGER_v1.0.1
GINGER_v1.0.1: Pulling from i10labtitech/tools
482bd95e477d: Download complete 
1bc677758ad7: Download complete 
Digest: sha256:8f6de2fc83d99a8df64fcc82cddad1bdca6e0d4175757e629a8ff7da6f106421
Status: Downloaded newer image for i10labtitech/tools:GINGER_v1.0.1
docker.io/i10labtitech/tools:GINGER_v1.0.1
(base) :~/bio/for_ginger$ 
(base) :~/bio/for_ginger$ docker images
REPOSITORY           TAG             IMAGE ID       CREATED         SIZE
hello-world          latest          d715f14f9eca   6 days ago      17kB
dfam/tetools         latest          f60775010b4d   4 months ago    4.18GB
i10labtitech/tools   GINGER_v1.0.1   8f6de2fc83d9   20 months ago   23.6GB
(base) :~/bio/for_ginger$

ちゃんと入ってそう。

docker run -t -i i10labtitech/tools:GINGER_v1.0.1 /bin/bashで実行

(base) :~/bio/for_ginger$ docker run -t -i i10labtitech/tools:GINGER_v1.0.1 /bin/bash
WARNING: The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested
(base) root@f9f7fcf458b4:/# pwd
/
(base) root@f9f7fcf458b4:/# ls
GINGER_v1.0.1  bin  boot  data1  data2  dev  etc  home  lib  lib32  lib64  libx32  media  mnt  nextflow  opt  proc  root  run  sbin  scratch  srv  sys  tmp  usr  var
(base) root@f9f7fcf458b4:/#

0129

フェモラータのBRAKER

kosukesano@at138:~/tools/for_braker/250127_Sfem$ ls braker/
Augustus  GeneMark-ETP  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  species  what-to-cite.txt
kosukesano@at138:~/tools/for_braker/250127_Sfem$
kosukesano@at138:~/tools/for_braker/250127_Sfem/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    14,515  6,313,363        2      435    6,295
kosukesano@at138:~/tools/for_braker/250127_Sfem/braker$ 

できてそうではある

BRAKERでアノテーションつけたフェモラータのBUSCO

これを`qsubで投げた

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
echo start at
date


date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
        -m protein\
        -i /home/kosukesano/tools/for_braker/250127_Sfem/braker/braker.aa\
        -o BUSCO_output\
        -l /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
        -f

echo end at
date

-oのオプションはフルパス通すとエラーになるので注意!

結果がこう

# BUSCO version is: 5.1.3 
# The lineage dataset is:  (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_braker/250127_Sfem/braker/braker.aa
# BUSCO was run in mode: proteins

        ***** Results: *****

        C:73.4%[S:56.8%,D:16.6%],F:5.0%,M:21.6%,n:1013     
        743     Complete BUSCOs (C)                        
        575     Complete and single-copy BUSCOs (S)        
        168     Complete and duplicated BUSCOs (D)         
        51      Fragmented BUSCOs (F)                      
        219     Missing BUSCOs (M)                         
        1013    Total BUSCO groups searched                

Dependencies and versions:
        hmmsearch: 3.1

うーん……。ゲノムの時はいい感じだったから、BRAKERの設定が悪いのかなあ。

ローカルのファイルをDockerコンテナで使う練習

テスト用にpythonの環境を立てる

(base) :~/bio/for_ginger$ docker pull python
Using default tag: latest
latest: Pulling from library/python
4cf0e15c283e: Download complete 
e474a4a4cbbf: Download complete 
94c5996c7a64: Download complete 
133055fd9ad7: Download complete 
936252136b92: Download complete 
00fcba8cde0d: Download complete 
d22b85d68f8a: Download complete 
Digest: sha256:137ae4b9f85671bd912a82a19b6966e2655f73e13579b5d6ad4edbddaaf62a9c
Status: Downloaded newer image for python:latest
docker.io/library/python:latest
(base) :~/bio/for_ginger$ docker images
REPOSITORY           TAG             IMAGE ID       CREATED         SIZE
hello-world          latest          d715f14f9eca   7 days ago      17kB
python               latest          137ae4b9f856   11 days ago     1.47GB
dfam/tetools         latest          f60775010b4d   4 months ago    4.18GB
i10labtitech/tools   GINGER_v1.0.1   8f6de2fc83d9   20 months ago   23.6GB
(base) :~/bio/for_ginger$ docker container run -it python
Python 3.13.1 (main, Jan 24 2025, 20:47:48) [GCC 12.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> exit
(base) :~/bio/for_ginger$ docker ps -a
CONTAINER ID   IMAGE                              COMMAND       CREATED              STATUS                      PORTS     NAMES
d3c9a7023050   python                             "python3"     About a minute ago   Exited (0) 13 seconds ago             mystifying_goldberg
c680624159e6   dfam/tetools:latest                "/bin/bash"   15 minutes ago       Exited (0) 2 minutes ago              infallible_lehmann
f9f7fcf458b4   i10labtitech/tools:GINGER_v1.0.1   "/bin/bash"   21 hours ago         Exited (0) 18 minutes ago             strange_chaum
7d2bf9d26f8e   i10labtitech/tools:GINGER_v1.0.1   "/bin/bash"   21 hours ago         Exited (0) 21 hours ago               distracted_shockley
8481b2955c5a   i10labtitech/tools:GINGER_v1.0.1   "/bin/bash"   21 hours ago         Exited (0) 21 hours ago               sweet_lederberg
04727df21c92   hello-world                        "/hello"      21 hours ago         Exited (0) 21 hours ago               stupefied_lovelace
73f896bc2cd9   dfam/tetools                       "bash"        2 months ago         Exited (255) 21 hours ago             dfamtet
(base) :~/bio/for_ginger$ 
(base) :~/bio/for_ginger$ docker container restart d3c9a7023050
d3c9a7023050
(base) :~/bio/for_ginger$ docker exec -it charming_mirzakhani bash
Error response from daemon: No such container: charming_mirzakhani
(base) :~/bio/for_ginger$ docker exec -it python bash
Error response from daemon: No such container: python
(base) :~/bio/for_ginger$ docker exec -it d3c9a7023050 bash
root@d3c9a7023050:/# pwd
/
root@d3c9a7023050:/# ls
bin  boot  dev  etc  home  lib  media  memo.txt  mnt  opt  proc  root  run  sbin  srv  sys  tmp  usr  var
root@d3c9a7023050:/# less memo.txt 
bash: less: command not found
root@d3c9a7023050:/#
root@d3c9a7023050:/# cat /etc/issue
Debian GNU/Linux 12 \n \l

root@d3c9a7023050:/# apt-get update
Get:1 http://deb.debian.org/debian bookworm InRelease [151 kB]
Get:2 http://deb.debian.org/debian bookworm-updates InRelease [55.4 kB]
Get:3 http://deb.debian.org/debian-security bookworm-security InRelease [48.0 kB]
Get:4 http://deb.debian.org/debian bookworm/main arm64 Packages [8693 kB]
Get:5 http://deb.debian.org/debian bookworm-updates/main arm64 Packages [13.3 kB]
Get:6 http://deb.debian.org/debian-security bookworm-security/main arm64 Packages [239 kB]
Fetched 9199 kB in 1s (9432 kB/s)                    
Reading package lists... Done
root@d3c9a7023050:/# apt-get install less -y
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  less
0 upgraded, 1 newly installed, 0 to remove and 5 not upgraded.
Need to get 128 kB of archives.
After this operation, 434 kB of additional disk space will be used.
Get:1 http://deb.debian.org/debian bookworm/main arm64 less arm64 590-2.1~deb12u2 [128 kB]
Fetched 128 kB in 0s (2188 kB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package less.
(Reading database ... 23992 files and directories currently installed.)
Preparing to unpack .../less_590-2.1~deb12u2_arm64.deb ...
Unpacking less (590-2.1~deb12u2) ...
Setting up less (590-2.1~deb12u2) ...
root@d3c9a7023050:/# less memo.txt 
"memo.txt" may be a binary file.  See it anyway? 
root@d3c9a7023050:/# 

なんかバイナリ扱いされてんだが?

GINGERサンプルデータの取得

Dockerが起動している状態で(?)、ローカルの作業ノードで

perl generateSampleData_cel.py sample
(base) :~/bio/for_ginger/test/GINGER$ ls
AUTHORS                   FAQ                       Makefile                  generateSampleData_cel.pl runEvaluatePred.pl        src
CHANGES                   INSTALL                   README                    nextflow.config.user      runGINGER.pl              util
ChangeLog                 LICENSE                   VERSION                   pipeline                  sample
(base) :~/bio/for_ginger/test/GINGER$ ls sample/
GCA_000180635.4_El_Paco_v._4_translated_cds.faa             GCF_000002985.6_WBcel235_genomic.out
GCF_000002985.6_WBcel235_genomic.commentModified.fna        GCF_000004555.2_CB4_translated_cds.faa
GCF_000002985.6_WBcel235_genomic.commentModified.masked.fna SRR5849934_1.fastq
GCF_000002985.6_WBcel235_genomic.gff                        SRR5849934_2.fastq
(base) :~/bio/for_ginger/test/GINGER$ 

サンプルデータを用いたGINGER

### 250129_test_output/summary.stderrの中身

WARNING: The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested
docker: Error response from daemon: Mounts denied: 
The path /scratch is not shared from the host and is not known to Docker.
You can configure shared paths from Docker -> Preferences... -> Resources -> File Sharing.
See https://docs.docker.com/desktop/settings/mac/#file-sharing for more info.

0130

コフキのソフトマスク結果

kosukesano@at137:~/tools/for_softmask/250121_Kohuki_softmask$ ls output_dir/
kosukesano@at137:~/tools/for_softmask/250121_Kohuki_softmask$

結果が出力されていないんだが?

..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
(END)

途中で止まっちゃったっぽい。なぜえ。

6種のゲノムデータを用いた種系統樹推定

  • 1:Manualphylo_dataディレクトリの作成
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28$ mkdir Manualphylo_data
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28$
  • 2:Manualphylo_1.pyの実行
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ python Manualphylo_1.py 
     Orthogroup           Agra_iso1         Cass_iso1           Dpon_iso1       Smad_iso1           Sory_iso1           Tcas_iso1
3316  OG0003316  Agra_P_050297705.1  Cass_AG9763147.1  Dpon_P_019756877.1   Smad_g5919.t1  Sory_P_030749172.1  Tcas_P_008190965.1
3318  OG0003318  Agra_P_050313709.1  Cass_AG9759263.1  Dpon_P_019760448.1   Smad_g2942.t1  Sory_P_030746210.1  Tcas_P_008193499.1
3319  OG0003319  Agra_P_050310562.1  Cass_AG9760850.1  Dpon_P_019762920.2    Smad_g874.t1  Sory_P_030761663.1  Tcas_P_008197831.1
3320  OG0003320  Agra_P_050302120.1  Cass_AG9772342.1  Dpon_P_019761695.1   Smad_g4673.t1  Sory_P_030759635.1     Tcas_P_966819.1
3321  OG0003321  Agra_P_050298809.1  Cass_AG9761965.1  Dpon_P_019754246.2   Smad_g9255.t2  Sory_P_030750748.1  Tcas_P_015834610.1
...         ...                 ...               ...                 ...             ...                 ...                 ...
8556  OG0008556  Agra_P_050313166.1  Cass_AG9759544.1  Dpon_P_019753580.1  Smad_g11268.t1  Sory_P_030756213.1  Tcas_P_015834054.1
8557  OG0008557  Agra_P_050299456.1  Cass_AG9767641.1  Dpon_P_019768109.2  Smad_g10715.t1  Sory_P_030747660.1  Tcas_P_008194715.1
8558  OG0008558  Agra_P_050296789.1  Cass_AH1126441.1  Dpon_P_019767524.1   Smad_g3535.t1  Sory_P_030750133.1  Tcas_P_008195711.1
8559  OG0008559  Agra_P_050308325.1  Cass_AG9761889.1  Dpon_P_048517173.1   Smad_g9800.t1  Sory_P_030761000.1     Tcas_P_968816.1
8560  OG0008560  Agra_P_050308491.1  Cass_AH1131527.1  Dpon_P_048526405.1   Smad_g6531.t1  Sory_P_030748824.1  Tcas_P_001811794.1

[4830 rows x 7 columns]
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ ls
Manualphylo_1.py  OG_list.txt  species_list.txt
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$
  • 3:all_seq.faの作成

fasta_concatinate.shを実行する

kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ sh fasta_concatinate.sh 
start at
Thu Jan 30 15:37:27 JST 2025
Thu Jan 30 15:37:29 JST 2025
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ ls
Manualphylo_1.py  OG_list.txt  all_seq.fa  fasta_concatinate.sh  species_list.txt
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$
  • 4:Manualphylo_2.pyの実行
    SCOのOG番号ごとにファイルができる、時間かかる

  • 5:align.shの実行
    SCOのファイルがMAFFTによりアライメントされる,時間かかる

  • 6:makealltreeの実行

2025年2月

0203

makealltree.sh出力のファイルを元にしたASTRALの実行

~/tools/for_ASTRAL/Astral/data/250203_6spall_trees.nwkをコピー。

### all_trees.nwkの中身

OG0003316: (Agra_P_050297705.1:0.1651293024,(Cass_AG9763147.1:0.1818482204,(Sory_P_030749172.1:0.1209868450,Tcas_P_008190965.1:0.9786646109)49:0.1585813214)37:0.0878202062,(Dpon_P_019756877.1:0.1886146557,Smad_g5919.t1:0.3120157875)25:0.0602110128);

OG0003318: (Agra_P_050313709.1:0.0315659308,(Cass_AG9759263.1:0.0142661688,Sory_P_030746210.1:0.0106140127)72:0.0047846194,((Dpon_P_019760448.1:0.0204733482,Tcas_P_008193499.1:0.0623490402)46:0.0013761600,Smad_g2942.t1:0.0000010000)41:0.0000010000);

OG0003319: (Agra_P_050310562.1:0.1472894568,((Cass_AG9760850.1:0.1222426475,Smad_g874.t1:0.1761863246)64:0.0413255253,Sory_P_030761663.1:0.1262376730)58:0.0310103463,(Dpon_P_019762920.2:0.1504657030,Tcas_P_008197831.1:0.3955563338)53:0.0738820816);

OG0003320: (Agra_P_050302120.1:0.2407713260,(Cass_AG9772342.1:0.0426387253,Dpon_P_019761695.1:0.2475477632)61:0.0326335634,(Smad_g4673.t1:0.1131079176,(Sory_P_030759635.1:0.2653371088,Tcas_P_966819.1:0.5311370910)99:0.1490033020)61:0.0232625701);

OG0003321: (Agra_P_050298809.1:0.4846044357,(Cass_AG9761965.1:0.6729211459,(Dpon_P_019754246.2:0.3812087652,Smad_g9255.t2:0.1754803069)72:0.0580715184)51:0.0626765693,(Sory_P_030750748.1:0.4116607756,Tcas_P_015834610.1:1.1104186828)35:0.0790941709);
.
.
.
.
.
.

OG番号や遺伝子IDなどの無駄な情報があるので、それらを削除するスクリプトを用意した。

### modify.pyの中身

import re

# 元のファイルと新しいファイルのパスを設定
input_file_path = 'all_trees.nwk'
output_file_path = 'modified_trees.nwk'

# 処理を実行
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
    for line in infile:
        # 行を ': ' で分割し、2つ以上の要素がある場合のみ処理
        parts = line.split(': ', 1)
        if len(parts) > 1:
            modified_line = parts[1]  # `:` 以降の部分のみ取得
        else:
            modified_line = line  # `:` がない場合はそのまま保持

        # ラベルの変換:「四文字の種名_遺伝子名」→「四文字の種名」のみ
        modified_line = re.sub(r"\b([A-Za-z]{4})_[^,():]+", r"\1", modified_line)

        # 新しいファイルに書き込み
        outfile.write(modified_line)

print("ツリーファイルの変換が完了しました:", output_file_path)

これまでのmodify.pyと違って、遺伝子IDも除去するようにスクリプトを改造してある。

この出力のmodified_trees.nwkを使用する。

### ASTRAL.shの中身


#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date

java -Xmx2G -jar astral.5.7.8.jar \
    -i /home/kosukesano/tools/for_ASTRAL/Astral/data/250203_6sp/modified_trees.nwk \
    -o /home/kosukesano/tools/for_ASTRAL/Astral/250203_6sp/out.tre \
    2>/home/kosukesano/tools/for_ASTRAL/Astral/250203_6sp/out.log

date

この出力がこれ

kosukesano@at139:~/tools/for_ASTRAL/Astral$ ls 250203_6sp/
out.log  out.tre
kosukesano@at139:~/tools/for_ASTRAL/Astral$
(Agra,((Sory,Tcas)1:0.8510176414897351,(Dpon,(Smad,Cass)0.89:0.0330270621831446)1:0.05640338444435427):0.0);

0204

フェモラータゲノムのBRAKER、RNA-Seqデータ無し

フェモラータのBRAKERのクオリティが低いのはRNA-seqの個体とゲノムの元個体が違う産地で、ちゃんとくっつかなかったからでは?RNA-seqデータを抜いてやってみる。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date

source /home/kosukesano/tools/pyenv_env/braker_profile

braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/250127_Sfem_upper1000_masked.fasta\
        --prot_seq=/home/kosukesano/tools/Arthropoda.fa\
        --threads=16\
        --species=250204_Sfemorata\
        --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
        --AUGUSTUS_BIN_PATH=/usr/bin\
        --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
        --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
        --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
        --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin

echo end at
date

RNA-Seqのインプットを抜いただけ

0205

RNA-seqデータを抜いたフェモラータのBRAKER結果・BUSCO

kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$ ls
Augustus  GeneMark-EP  GeneMark-ES  braker.aa  braker.codingseq  braker.gtf  braker.log  errors  genome_header.map  hintsfile.gff  prothint.gff  species  what-to-cite.txt
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$ 
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file       format  type     num_seqs    sum_len  min_len  avg_len  max_len
braker.aa  FASTA   Protein    17,544  7,428,397        6    423.4   21,411
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$

シーケンス数が3000くらい増えてるね。

BUSCOの結果はこう

# BUSCO version is: 5.1.3 
# The lineage dataset is:  (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_braker/250204_Sfem/braker/braker.aa
# BUSCO was run in mode: proteins

        ***** Results: *****

        C:92.2%[S:85.3%,D:6.9%],F:3.3%,M:4.5%,n:1013       
        934     Complete BUSCOs (C)                        
        864     Complete and single-copy BUSCOs (S)        
        70      Complete and duplicated BUSCOs (D)         
        33      Fragmented BUSCOs (F)                      
        46      Missing BUSCOs (M)                         
        1013    Total BUSCO groups searched                

Dependencies and versions:
        hmmsearch: 3.1

やっぱりRNAーseqのデータが良くなかったっぽいですね

OrthoFinder出力からSCOのCDSを取得

  • 1.ExOG.pyを実行
  • 2.6種のCDSファイルをコピー
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/tools/for_braker/241129_madara/braker/braker.codingseq Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1
Anthonomus_buscotest.sh            Anthonomus_buscotest.sh.o25642658   Anthonomus_buscotest.sh.po25642658        busco_downloads  cds_from_genomic.fna  protein.faa  sequence_report.jsonl
Anthonomus_buscotest.sh.e25642658  Anthonomus_buscotest.sh.pe25642658  GCF_022605725.1_icAntGran1.3_genomic.fna  busco_out        genomic.gff           rna.fna
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1/cds_from_genomic.fna Agra.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta  Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1
Ceutorhynchus_buscotest.sh            Ceutorhynchus_buscotest.sh.o25642655   Ceutorhynchus_buscotest.sh.po25642655      busco_downloads  cds_from_genomic.fna  protein.faa
Ceutorhynchus_buscotest.sh.e25642655  Ceutorhynchus_buscotest.sh.pe25642655  GCA_917834065.1_PGI_CEUTPL_v4_genomic.fna  busco_out        genomic.gff           sequence_report.jsonl
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1/cds_from_genomic.fna Cass.
fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta  Cass.fasta  Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta  Cass.fasta  Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1
GCF_002938485.1_Soryzae_2.0_genomic.fna  Soryzae_busco.sh.e26203344  Soryzae_busco.sh.pe26203344  busco_downloads  cds_from_genomic.fna  genomic.gff  protein.faa  sequence_report.jsonl
Soryzae_busco.sh                         Soryzae_busco.sh.o26203344  Soryzae_busco.sh.po26203344  busco_out        genomic.gbff          genomic.gtf  rna.fna
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1/cds_from_genomic.fna 
cp: missing destination file operand after '/home/kosukesano/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1/cds_from_genomic.fna'
Try 'cp --help' for more information.
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1/cds_from_genomic.fna Sory.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta  Cass.fasta  Smad.fasta  Sory.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1
GCF_020466585.1_Dpon_F_20191213v2_genomic.fna  cds_from_genomic.fna  genomic.gff  protein.faa  rna.fna  sequence_report.jsonl
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1/cds_from_genomic.fna Dpon.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Smad.fasta  Sory.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3
GCF_000002335.3_Tcas5.2_genomic.fna  Tribolium_buscotest.sh.o25642647   busco_downloads       genomic.gff  rna.fna                test.faa
Tribolium_buscotest.sh               Tribolium_buscotest.sh.pe25642647  busco_out             output.faa   sequence_report.jsonl  test.gff
Tribolium_buscotest.sh.e25642647     Tribolium_buscotest.sh.po25642647  cds_from_genomic.fna  protein.faa  test
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3/cds_from_genomic.fna Tcas.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta  Cass.fasta  Dpon.fasta  Smad.fasta  Sory.fasta  Tcas.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$
  • 3.ch.hed.pyを実行
  • 4.edit.pyを実行
  • 5.new_makefna.pyを実行
  • 6.mafft.shをqsubで実行
  • 7.fix.pyを実行

0206

PAMLの実行

昨日までの前処理がうまくいき、/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDSディレクトリに*_maffted_fixed.fnaファイルができた。これを使ってPAMLのBranch-Siteモデルにかける。

まず~/tools/for_paml/250206_6spディレクトリを作成し、その下にbsAbs_nullディレクトリを新たに作成。

kosukesano@at139:~/tools/for_paml/250206_6sp$ ls
bsA  bs_null
kosukesano@at139:~/tools/for_paml/250206_6sp$ 

bsAについて

~/tools/for_paml/250206_6sp/bsAディレクトリ直下でbsA_paml.shtemplate.ctlを作成、bsA_paml.shqsubで投げた。

### bsA_paml.sh

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS"
bsA_dir="/home/kosukesano/tools/for_paml/250206_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
### template.ctl

seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/data/250205_6sp/out.tre
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

インプットのツリーファイルは~/tools/for_ASTRAL/Astral/250203_6sp/ASTRAL出力のものをコピーした。

bs_nullについて

同じくファイルを作成、qsubで投げた。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu

# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS"
bsA_dir="/home/kosukesano/tools/for_paml/250206_6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"

# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"

# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")

# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
  if [[ -f "$file" ]]; then
    base_name=$(basename "$file" .fna)
    outfile_path="$result_dir/${base_name}_branch_alt"

    # 一時的な制御ファイルの内容を生成
    ctl_content="${ctl_template//<SEQFILE>/$file}"
    ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"

    # 一時的な制御ファイルを作成
    ctl_path="$bsA_dir/bsA.ctl"
    echo "$ctl_content" > "$ctl_path"

    # PAMLを実行
    singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"

    echo "Processed file: $file, output: $outfile_path"
  fi
done
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/data/250205_6sp/out.tre
outfile = <OUTFILE>

noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0

0207

trimalによるCDSのトリミング

なんかPAMLうまく解析出来てないんだよなあと思ったらトリミングできてなかった。MAFFTをかけたファイルについて、ギャップの部分をトリミングしておく。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6

source ~/tools/pyenv_env/ManualPhilo_profile

# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS/"
output_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*_maffted_fixed.fna; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" _maffted_fixed.fna)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_trimed.fna"
  output_html="${output_dir}${base_name}_trimed.html"

  # MAFFTを実行
  mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
  trimal -in "$file" -out "$output_file" -htmlout "$output_html" -gt 0.9 -cons 60


  echo "trimed file created: $output_file"
done

これをqsubで投げた。-gt 0.9-gt 1とかにした方がいいかも。

PAML続き

ツリーをout.log.treeに変更

(Cass,(Smad#1,(Dpon,(Agra,(Sory,Tcas)))));

0210

PAML続き

### bs_lrp.py

import os
import re
from scipy.stats import chi2

def parse_lnL(file_path):
    with open(file_path, 'r') as f:
        for line in f:
            match = re.search(r'lnL\(ntime: \d+  np: (\d+)\):\s+(-?\d+\.\d+)', line)
            if match:
                np = int(match.group(1))
                lnL = float(match.group(2))
                return np, lnL
    return None, None

def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
    lr_stat = 2 * (alt_lnL - null_lnL)
    df = alt_np - null_np
    p_val = chi2.sf(lr_stat, df)
    return p_val

def main():
    alt_dir = '/home/kosukesano/tools/for_paml/250207_6sp/bsA/result'
    null_dir = '/home/kosukesano/tools/for_paml/250207_6sp/bs_null/result'
    output_file = 'branch_site_lrt_results.txt'

    alt_dir = os.path.expanduser(alt_dir)
    null_dir = os.path.expanduser(null_dir)

    og_files = [f for f in os.listdir(alt_dir) if '_trimed_branch_alt' in f]

    with open(output_file, 'w') as out_f:
        out_f.write('OG_num\tp_val\tpositive_selection\n')

        for og_file in og_files:
            og_num = og_file.split('_')[0]
            alt_file = os.path.join(alt_dir, og_file)
            null_file = os.path.join(null_dir, og_file)

            if os.path.exists(null_file):
                alt_np, alt_lnL = parse_lnL(alt_file)
                null_np, null_lnL = parse_lnL(null_file)

                if alt_np is not None and null_np is not None:
                    p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
                    reject_null = '+' if p_val < 0.05 else '-'
                    out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')

if __name__ == "__main__":
    main()

結果はこう

kosukesano@at138:~/tools/for_paml/250207_6sp$ grep -io + branch_site_lrt_results.txt | wc -l
188
kosukesano@at138:~/tools/for_paml/250207_6sp$ 

188遺伝子で正の選択を検出!

マダラの新しいゲノムに機能アノテーションを移植する

以下のスクリプトを実行した。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12

echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID

echo starting at
date

#BLASTの標準列名を定義

header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"

#出力ファイルの定義

output_file="/home/kosukesano/reference_sequence/250210_Fnc_anno/out_madara_blastp_test.txt"

#列名を出力ファイルに書き込む

echo "$header" > $output_file

#BLASTpを実行し、結果を追加する

singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query /home/kosukesano/reference_sequence/250210_Fnc_anno/241129_madara_iso1.aa \
-db /home/kosukesano/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/merge_4sp \
-evalue 1e-04 \
-outfmt 6 >> $output_file

echo ending at
date

tRNA関連遺伝子の抽出

テスト

kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ nano e2.fasta
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ cp ~/tools/for_braker/241129_madara/241129_madara_iso1.aa ../blast_test/
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ ls
241129_madara_iso1.aa  e2.fasta
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ singularity exec -e /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 makeblastdb -in 241129_madara_iso1.aa -out madara -dbtype prot -
hash_index
WARNING: Skipping mount /opt/pkg/singularity-ce/4.0.0/var/singularity/mnt/session/etc/resolv.conf [files]: /etc/resolv.conf doesn't exist in container


Building a new DB, current time: 02/10/2025 15:09:24
New DB name:   /home/kosukesano/tools/for_paml/250210_tRNA/blast_test/madara
New DB title:  241129_madara_iso1.aa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 12337 sequences in 0.402422 seconds.
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ ls
241129_madara_iso1.aa  e2.fasta  madara.phd  madara.phi  madara.phr  madara.pin  madara.pog  madara.psd  madara.psi  madara.psq
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$

0211

tRNA修飾とCK合成に関わる遺伝子の検出

先行研究に記載のあった遺伝子e1~e17について、NCBIのタンパク質IDを元にアミノ酸配列を取得。これをクエリーにしてマダラのゲノムにblastp検索をかけた。

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6

echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID

echo starting at
date

#BLASTの標準列名を定義

header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"

#出力ファイルの定義

output_file="/home/kosukesano/tools/for_paml/250210_tRNA/blast_test/tRNAgene_out.txt"

#列名を出力ファイルに書き込む

echo "$header" > $output_file

#BLASTpを実行し、結果を追加する

singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query /home/kosukesano/tools/for_paml/250210_tRNA/blast_test/tRNAgene.fasta  \
-db /home/kosukesano/tools/for_paml/250210_tRNA/blast_test/madara \
-evalue 1e-04 \
-outfmt 6 >> $output_file

echo ending at
date

結果は以下の通り。

qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
e1_1    g6098.t1        26.829  123     77      4       1       123     5       114     8.69e-06        45.4
e1_2    g6098.t1        36.792  106     66      1       67      172     7       111     9.27e-22        94.7
e2      g6098.t1        44.875  439     207     5       1       434     1       409     8.39e-115       343
e3      g1799.t1        34.559  408     236     4       2       380     106     511     1.61e-74        243
e3      g6011.t1        27.494  451     277     13      4       433     67      488     3.28e-40        150
e5      g8991.t1        60.000  580     227     3       216     791     67      645     0.0     756
e5      g2065.t1        26.892  502     301     17      213     655     35      529     5.10e-32        130
e8      g6207.t1        53.846  286     124     2       88      367     43      326     2.62e-112       330
e8      g6208.t1        50.877  285     138     1       83      367     20      302     1.85e-103       306
e8      g405.t1 23.904  251     165     11      125     364     15      250     1.28e-07        51.2
e9      g1876.t1        55.977  343     150     1       2       343     3       345     2.56e-147       418
e9      g3375.t1        33.537  328     209     5       14      336     11      334     2.15e-51        172
e10     g11508.t1       52.247  178     85      0       5       182     10      187     3.82e-56        174
e12     g4172.t1        26.136  176     93      7       264     430     206     353     1.62e-08        55.5
e12     g6393.t1        28.099  121     63      3       264     381     291     390     2.05e-08        55.1
e12     g396.t1 30.000  110     68      3       328     437     340     440     4.85e-08        53.9
.
.
.
.
.

先行研究では昆虫のゲノムで検出されなかったとしているe1やe12~のタンパク質もヒットしているが、bitscoreやevalueの値が悪そう。

とりあえず機能やオーソログ情報を紐付けてみよう。

# A tibble: 68 × 11
   qseqid sseqid      evalue Sory_GeneFunction    Orthogroup Agra_iso1 Cass_iso1
   <chr>  <chr>        <dbl> <chr>                <chr>      <chr>     <chr>    
 1 e1_1   g6098.t1 8.69e-  6 tRNA dimethylallylt… OG0003486  Agra_P_0… Cass_AG9…
 2 e1_2   g6098.t1 9.27e- 22 tRNA dimethylallylt… OG0003486  Agra_P_0… Cass_AG9…
 3 e2     g6098.t1 8.39e-115 tRNA dimethylallylt… OG0003486  Agra_P_0… Cass_AG9…
 4 e3     g1799.t1 1.61e- 74 CDK5RAP1-like prote… OG0008328  Agra_P_0… Cass_AG9…
 5 e3     g6011.t1 3.28e- 40 threonylcarbamoylad… OG0005024  Agra_P_0… Cass_AG9…
 6 e5     g8991.t1 0         protein 5NUC         OG0004173  Agra_P_0… Cass_AH1…
 7 e5     g2065.t1 5.10e- 32 protein 5NUC-like, … OG0001251  Agra_P_0… Cass_AG9…
 8 e8     g6207.t1 2.62e-112 purine nucleoside p… OG0000755  Agra_P_0… Cass_AG9…
 9 e8     g6208.t1 1.85e-103 purine nucleoside p… OG0000755  Agra_P_0… Cass_AG9…
10 e8     g405.t1  1.28e-  7 S-methyl-5'-thioade… OG0000587  Agra_P_0… Cass_AG9…
# ℹ 58 more rows
# ℹ 4 more variables: Dpon_iso1 <chr>, Smad_iso1 <chr>, Sory_iso1 <chr>,
#   Tcas_iso1 <chr>

0212

PRANKによるコドンアライメント

PAMLの結果がうまく出なかった。どうもコドンベースでアライメントされておらず、3の倍数になってないものがあるみたい。

PRANKを使用しコドンアライメントを行う。

### ~/tools/for_paml/data/250211_6sp/prank.sh

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6



# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/250211_6sp/PRANK_CDS/"
output_dir="/home/kosukesano/tools/for_paml/data/250211_6sp/PRANK_CDS/"

# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fna)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_pranked.fna"

  # prankを実行
  singularity exec -e /usr/local/biotools/p/prank\:v.170427--h9f5acd7_6 prank -d="$file" -o="$output_file" -codon -F

  echo "Aligned file created: $output_file"
done

CK生合成に関わる遺伝子のPAML

~/tools/for_paml/250210_tRNAを作成、この下のnama_data/OrthoFinder出力のオーソログ.fastaファイルと遺伝子系統樹のファイルを置いた。

kosukesano@at139:~/tools/for_paml/250210_tRNA$ ls nama_data/
OG0000120.fa        OG0000572_tree.txt  OG0000755.fa        OG0001494_tree.txt  OG0003982.fa        OG0005024_tree.txt  OG0008328.fa        OG0010036_tree.txt
OG0000120_tree.txt  OG0000584.fa        OG0000755_tree.txt  OG0003285.fa        OG0003982_tree.txt  OG0006816.fa        OG0008328_tree.txt
OG0000203.fa        OG0000584_tree.txt  OG0001251.fa        OG0003285_tree.txt  OG0004173.fa        OG0006816_tree.txt  OG0009811.fa
OG0000203_tree.txt  OG0000587.fa        OG0001251_tree.txt  OG0003486.fa        OG0004173_tree.txt  OG0008087.fa        OG0009811_tree.txt
OG0000572.fa        OG0000587_tree.txt  OG0001494.fa        OG0003486_tree.txt  OG0005024.fa        OG0008087_tree.txt  OG0010036.fa
kosukesano@at139:~/tools/for_paml/250210_tRNA$

~/tools/for_paml/250210_tRNAExOG.pyを実行した。

### ~/tools/for_paml/250210_tRNA/ExOG.py

# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_paml/250210_tRNA/CK_OG_no.txt'
output_file_path = '/home/kosukesano/tools/for_paml/250210_tRNA/extracted_orthogroups.txt'

# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
    for line in single_copy_file:
        single_copy_orthologues.add(line.strip())

# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
    for line in orthogroups_file:
        # 行の最初の部分を取り出してIDをチェック
        og_id = line.split(':')[0].strip()
        if og_id in single_copy_orthologues:
            output_file.write(line)

続いて、makefna.pyを実行した。

### ~/tools/for_paml/250210_tRNA/makefna.pyの中身

import os

# ファイルパスの設定
orthogroups_file = "/home/kosukesano/tools/for_paml/250210_tRNA/extracted_orthogroups.txt"
input_dir = "/home/kosukesano/tools/for_paml/data/250205_6sp/changehedder/kansei/"
output_dir = "/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS/"

# ディレクトリが存在しない場合、作成
os.makedirs(output_dir, exist_ok=True)

# 種名とプレフィックスの対応辞書
species_prefix = {
    "Agra": "Agra",
    "Cass": "Cass",
    "Dpon": "Dpon",
    "Smad": "Smad",
    "Sory": "Sory",
    "Tcas": "Tcas"
}

# 各種のFASTAファイルを辞書に格納
fasta_files = {species: os.path.join(input_dir, f"{species}_changehedder.fasta") for species in species_prefix}

# OG番号と遺伝子IDをextracted_orthogroups.txtから取得
with open(orthogroups_file, "r") as ortho_f:
    for line in ortho_f:
        if line.strip():  # 空行を無視
            # 行をOG番号と遺伝子IDリストに分割
            og_number, gene_ids_str = line.split(":")
            og_number = og_number.strip()
            gene_ids = gene_ids_str.strip().split()

            # 出力ファイルのパス
            output_file = os.path.join(output_dir, f"{og_number}.fna")

            # 出力ファイルを開く
            with open(output_file, "w") as out_f:
                for gene_id in gene_ids:
                    species = None
                    prefix = gene_id[:4]
                    
                    # 種類の判定
                    for key, sp in species_prefix.items():
                        if prefix.startswith(sp):
                            species = sp
                            break
                    
                    if species and species in fasta_files:
                        fasta_file = fasta_files[species]
                        with open(fasta_file, "r") as fasta_f:
                            write_flag = False
                            for fasta_line in fasta_f:
                                if fasta_line.startswith(f">{gene_id}"):
                                    out_f.write(fasta_line)
                                    print(fasta_line.strip())
                                    write_flag = True
                                elif fasta_line.startswith(">") and write_flag:
                                    write_flag = False
                                elif write_flag:
                                    out_f.write(fasta_line)
                                    print(fasta_line.strip())
            print(f"{og_number}.fna ファイルが {output_dir} に保存されました。")

MCOにも対応した最新使用。

これで出力されたファイルに対してPRANKでアライメントを行う

#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6



# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS/"
output_dir="/home/kosukesano/tools/for_paml/250210_tRNA/PRANK_CDS/"


# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
  # 元のファイル名から拡張子を除いたものを取得
  base_name=$(basename "$file" .fna)

  # 出力ファイル名を生成
  output_file="${output_dir}${base_name}_pranked.fna"

  # prankを実行
  singularity exec -e /usr/local/biotools/p/prank\:v.170427--h9f5acd7_6 prank -d="$file" -o="$output_file" -codon -F

  echo "Aligned file created: $output_file"
done

また、OrthoFinder出力の遺伝子系統樹には種名の「Tcas_iso1_」のような接頭辞が葉のラベルにくっついていた。これを切り取る。

### ~/tools/for_paml/250210_tRNA/tree_edit.py

import os
import glob

# 処理対象のディレクトリ
directory = "/home/kosukesano/tools/for_paml/250210_tRNA/nama_data"

# 削除する文字列のリスト
remove_strings = ["Tcas_iso1_", "Smad_iso1_", "Dpon_iso1_", 
                  "Cass_iso1_", "Agra_iso1_", "Sory_iso1_"]

# _tree.txt で終わるファイルを取得
tree_files = glob.glob(os.path.join(directory, "*_tree.txt"))

# 各ファイルを処理
for file_path in tree_files:
    with open(file_path, "r") as f:
        content = f.read()

    # 指定の文字列をすべて除去
    for remove_str in remove_strings:
        content = content.replace(remove_str, "")

    # ファイルを上書き保存
    with open(file_path, "w") as f:
        f.write(content)

    print(f"Processed: {file_path}")

print("すべてのファイルを処理しました。")

3の倍数になるようにNをつける

PRANKは入力に使う配列が3の倍数じゃないとエラー起こしちゃうっぽい。

これを防ぐには、配列の最後にNを加えて3の倍数にしてあげればいい。

### ~/tools/for_paml/250210_tRNA/plusN.py


from pathlib import Path

def adjust_fasta_length(input_file, output_file):
    """FASTAファイルの配列を3の倍数に調整し、Nを追加する"""
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        header = ""
        sequence = ""

        for line in infile:
            line = line.strip()
            if line.startswith(">"):  # ヘッダー行
                if header and sequence:  # 既存のデータがある場合は処理
                    remainder = len(sequence) % 3
                    if remainder != 0:
                        sequence += "N" * (3 - remainder)  # N を追加
                    outfile.write(header + "\n" + sequence + "\n")

                header = line  # 新しいヘッダーを保存
                sequence = ""  # 新しいシーケンスをリセット
            else:
                sequence += line  # 配列データを連結

        # 最後のシーケンスを処理
        if header and sequence:
            remainder = len(sequence) % 3
            if remainder != 0:
                sequence += "N" * (3 - remainder)
            outfile.write(header + "\n" + sequence + "\n")

def process_all_fasta_files(input_dir, output_dir):
    """input_dir 内のすべての .fna ファイルを処理し、output_dir に保存"""
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    # 出力ディレクトリが存在しない場合は作成
    output_path.mkdir(parents=True, exist_ok=True)

    # .fna ファイルを処理
    for fasta_file in input_path.glob("*.fna"):
        output_file = output_path / fasta_file.name  # 出力ファイルのパス
        adjust_fasta_length(fasta_file, output_file)
        print(f"処理完了: {output_file}")

# 実行設定
input_directory = "/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS"
output_directory = "/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS_plusN"

process_all_fasta_files(input_directory, output_directory)
print("全ての処理が完了しました。")

0213

3の倍数になるようにNをつけた後のPRANK結果

kosukesano@at138:~/tools/for_paml/250210_tRNA$ ls PRANK_CDS_plusN/
OG0000120_pranked.fna.best.fas  OG0000755_pranked.fna.best.fas  OG0003982_pranked.fna.best.fas  OG0008328_pranked.fna.best.fas
OG0000203_pranked.fna.best.fas  OG0001251_pranked.fna.best.fas  OG0004173_pranked.fna.best.fas  OG0009811_pranked.fna.best.fas
OG0000572_pranked.fna.best.fas  OG0001494_pranked.fna.best.fas  OG0005024_pranked.fna.best.fas  OG0010036_pranked.fna.best.fas
OG0000584_pranked.fna.best.fas  OG0003285_pranked.fna.best.fas  OG0006816_pranked.fna.best.fas
OG0000587_pranked.fna.best.fas  OG0003486_pranked.fna.best.fas  OG0008087_pranked.fna.best.fas
kosukesano@at138:~/tools/for_paml/250210_tRNA$ 

できてる!

CK合成に関わるオーソログの系統樹に#1をつける

OG0000120

# A tibble: 8 × 5
  qseqid sseqid   Orthogroup   evalue Sory_GeneFunction                         
  <chr>  <chr>    <chr>         <dbl> <chr>                                     
1 e12    g6393.t1 OG0000120  2.05e- 8 UDP-glucuronosyltransferase 2C1-like isof…
2 e12    g6392.t1 OG0000120  1.99e- 7 UDP-glucuronosyltransferase 2C1-like isof…
3 e14    g6393.t1 OG0000120  1.32e-11 UDP-glucuronosyltransferase 2C1-like isof…
4 e14    g6392.t1 OG0000120  2.97e-11 UDP-glucuronosyltransferase 2C1-like isof…
5 e15    g6393.t1 OG0000120  8.32e- 7 UDP-glucuronosyltransferase 2C1-like isof…
6 e15    g6392.t1 OG0000120  2.13e- 6 UDP-glucuronosyltransferase 2C1-like isof…
7 e16    g6392.t1 OG0000120  9.32e- 8 UDP-glucuronosyltransferase 2C1-like isof…
8 e16    g6393.t1 OG0000120  2.19e- 7 UDP-glucuronosyltransferase 2C1-like isof…

(g6392, g6393)に#1を振った。

OG0000203

# A tibble: 7 × 5
  qseqid sseqid   Orthogroup     evalue Sory_GeneFunction                   
  <chr>  <chr>    <chr>           <dbl> <chr>                               
1 e12    g9319.t1 OG0000203  0.00000438 UDP-glucuronosyltransferase 1-9-like
2 e12    g9322.t1 OG0000203  0.00000451 UDP-glucuronosyltransferase 1-9-like
3 e12    g9321.t1 OG0000203  0.0000226  UDP-glucuronosyltransferase 1-9-like
4 e14    g9321.t1 OG0000203  0.00000126 UDP-glucuronosyltransferase 1-9-like
5 e14    g9319.t1 OG0000203  0.00000275 UDP-glucuronosyltransferase 1-9-like
6 e14    g9322.t1 OG0000203  0.0000028  UDP-glucuronosyltransferase 1-9-like
7 e16    g9319.t1 OG0000203  0.0000215  UDP-glucuronosyltransferase 1-9-like

(g9321, g9322)、g9319に#1を振った。別でg9323(アノテーションのつかなかったほう)だけの解析をしてもいいかも。

OG0000572

# A tibble: 8 × 5
  qseqid sseqid   Orthogroup   evalue Sory_GeneFunction                    
  <chr>  <chr>    <chr>         <dbl> <chr>                                
1 e12    g6542.t1 OG0000572  1.13e- 7 UDP-glucuronosyltransferase 2B13-like
2 e12    g6541.t1 OG0000572  5.47e- 6 UDP-glucuronosyltransferase 2B13-like
3 e13    g6542.t1 OG0000572  9.55e- 6 UDP-glucuronosyltransferase 2B13-like
4 e14    g6542.t1 OG0000572  4.42e-11 UDP-glucuronosyltransferase 2B13-like
5 e14    g6541.t1 OG0000572  7.08e-10 UDP-glucuronosyltransferase 2B13-like
6 e15    g6542.t1 OG0000572  6.22e- 8 UDP-glucuronosyltransferase 2B13-like
7 e16    g6542.t1 OG0000572  4.22e-10 UDP-glucuronosyltransferase 2B13-like
8 e16    g6541.t1 OG0000572  7.21e- 8 UDP-glucuronosyltransferase 2B13-like

g6541に#1を振った。

OG0000584

# A tibble: 10 × 5
   qseqid sseqid   Orthogroup       evalue Sory_GeneFunction                    
   <chr>  <chr>    <chr>             <dbl> <chr>                                
 1 e12    g396.t1  OG0000584  0.0000000485 2-hydroxyacylsphingosine 1-beta-gala…
 2 e12    g8427.t1 OG0000584  0.000000177  2-hydroxyacylsphingosine 1-beta-gala…
 3 e13    g8427.t1 OG0000584  0.00000362   2-hydroxyacylsphingosine 1-beta-gala…
 4 e13    g396.t1  OG0000584  0.00000391   2-hydroxyacylsphingosine 1-beta-gala…
 5 e14    g396.t1  OG0000584  0.0000000649 2-hydroxyacylsphingosine 1-beta-gala…
 6 e14    g8427.t1 OG0000584  0.000000872  2-hydroxyacylsphingosine 1-beta-gala…
 7 e15    g396.t1  OG0000584  0.000000156  2-hydroxyacylsphingosine 1-beta-gala…
 8 e15    g8427.t1 OG0000584  0.000000271  2-hydroxyacylsphingosine 1-beta-gala…
 9 e16    g396.t1  OG0000584  0.000000283  2-hydroxyacylsphingosine 1-beta-gala…
10 e16    g8427.t1 OG0000584  0.000000403  2-hydroxyacylsphingosine 1-beta-gala…

g8427, g396に#1を振った

OG0000587

# A tibble: 1 × 5
  qseqid sseqid  Orthogroup      evalue Sory_GeneFunction                       
  <chr>  <chr>   <chr>            <dbl> <chr>                                   
1 e8     g405.t1 OG0000587  0.000000128 S-methyl-5'-thioadenosine phosphorylase…

g405に#1を振った。別でg6550, g6551(アノテーションのつかなかったほう)だけの解析をしてもいいかも。

OG0000755

# A tibble: 2 × 5
  qseqid sseqid   Orthogroup    evalue Sory_GeneFunction                        
  <chr>  <chr>    <chr>          <dbl> <chr>                                    
1 e8     g6207.t1 OG0000755  2.62e-112 purine nucleoside phosphorylase-like iso…
2 e8     g6208.t1 OG0000755  1.85e-103 purine nucleoside phosphorylase-like iso…

g6027, g6028に#1を振った

OG0001251

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup   evalue Sory_GeneFunction         
  <chr>  <chr>    <chr>         <dbl> <chr>                     
1 e5     g2065.t1 OG0001251  5.10e-32 protein 5NUC-like, partial

g2065に#1を振った

*OG0001494

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup    evalue Sory_GeneFunction
  <chr>  <chr>    <chr>          <dbl> <chr>            
1 e9     g1876.t1 OG0001494  2.56e-147 adenosine kinase 

g1876に#1を振った

OG0003285

# A tibble: 16 × 5
   qseqid sseqid   Orthogroup   evalue Sory_GeneFunction                    
   <chr>  <chr>    <chr>         <dbl> <chr>                                
 1 e12    g4172.t1 OG0003285  1.62e- 8 UDP-glucuronosyltransferase 2B15-like
 2 e12    g4169.t1 OG0003285  8.65e- 6 UDP-glucuronosyltransferase 2B15-like
 3 e12    g6414.t2 OG0003285  1.06e- 5 UDP-glucuronosyltransferase 2B15-like
 4 e13    g4172.t1 OG0003285  4.40e- 6 UDP-glucuronosyltransferase 2B15-like
 5 e14    g4172.t1 OG0003285  2.02e-10 UDP-glucuronosyltransferase 2B15-like
 6 e14    g4170.t1 OG0003285  3.56e- 8 UDP-glucuronosyltransferase 2B15-like
 7 e14    g4171.t1 OG0003285  4.50e- 8 UDP-glucuronosyltransferase 2B15-like
 8 e14    g6414.t2 OG0003285  8.37e- 8 UDP-glucuronosyltransferase 2B15-like
 9 e14    g4169.t1 OG0003285  1.88e- 7 UDP-glucuronosyltransferase 2B15-like
10 e15    g4172.t1 OG0003285  5.09e- 8 UDP-glucuronosyltransferase 2B15-like
11 e15    g4169.t1 OG0003285  5.08e- 6 UDP-glucuronosyltransferase 2B15-like
12 e16    g4172.t1 OG0003285  2.64e- 8 UDP-glucuronosyltransferase 2B15-like
13 e16    g4170.t1 OG0003285  5.03e- 8 UDP-glucuronosyltransferase 2B15-like
14 e16    g4171.t1 OG0003285  1.32e- 7 UDP-glucuronosyltransferase 2B15-like
15 e16    g4169.t1 OG0003285  8.37e- 7 UDP-glucuronosyltransferase 2B15-like
16 e16    g6414.t2 OG0003285  6.89e- 6 UDP-glucuronosyltransferase 2B15-like

マダラ枝基部に#1を振った

OG0003486

# A tibble: 3 × 5
  qseqid sseqid   Orthogroup    evalue Sory_GeneFunction                       
  <chr>  <chr>    <chr>          <dbl> <chr>                                   
1 e1_1   g6098.t1 OG0003486  8.69e-  6 tRNA dimethylallyltransferase isoform X1
2 e1_2   g6098.t1 OG0003486  9.27e- 22 tRNA dimethylallyltransferase isoform X1
3 e2     g6098.t1 OG0003486  8.39e-115 tRNA dimethylallyltransferase isoform X1

g6098に#1を振った

OG003982

# A tibble: 2 × 5
  qseqid sseqid   Orthogroup        evalue Sory_GeneFunction                    
  <chr>  <chr>    <chr>              <dbl> <chr>                                
1 e12    g1104.t1 OG0003982  0.00000616    2-hydroxyacylsphingosine 1-beta-gala…
2 e14    g1104.t1 OG0003982  0.00000000168 2-hydroxyacylsphingosine 1-beta-gala…

g1104に#1を振った

OG0004173

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup evalue Sory_GeneFunction
  <chr>  <chr>    <chr>       <dbl> <chr>            
1 e5     g8991.t1 OG0004173       0 protein 5NUC     

g8991に#1を振った

OG0005024

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup   evalue Sory_GeneFunction                         
  <chr>  <chr>    <chr>         <dbl> <chr>                                     
1 e3     g6011.t1 OG0005024  3.28e-40 threonylcarbamoyladenosine tRNA methylthi…

g6011に#1を振った

OG0006816

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup       evalue Sory_GeneFunction              
  <chr>  <chr>    <chr>             <dbl> <chr>                          
1 e17    g5362.t1 OG0006816  0.0000000268 delta(24)-sterol reductase-like

g5362に#1を振った

OG0008087

# A tibble: 3 × 5
  qseqid sseqid    Orthogroup       evalue Sory_GeneFunction                   
  <chr>  <chr>     <chr>             <dbl> <chr>                               
1 e13    g11571.t1 OG0008087  0.0000485    UDP-glucuronosyltransferase 2C1-like
2 e14    g11571.t1 OG0008087  0.0000119    UDP-glucuronosyltransferase 2C1-like
3 e16    g11571.t1 OG0008087  0.0000000322 UDP-glucuronosyltransferase 2C1-like

g11571に#1を振った

OG0008328

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup   evalue Sory_GeneFunction    
  <chr>  <chr>    <chr>         <dbl> <chr>                
1 e3     g1799.t1 OG0008328  1.61e-74 CDK5RAP1-like protein

g1799に#1を振った

OG0009811

# A tibble: 1 × 5
  qseqid sseqid   Orthogroup   evalue Sory_GeneFunction    
  <chr>  <chr>    <chr>         <dbl> <chr>                
1 e9     g3375.t1 OG0009811  2.15e-51 adenosine kinase-like

g3375に#1を振った

OG0010036

# A tibble: 1 × 5
  qseqid sseqid    Orthogroup   evalue Sory_GeneFunction                
  <chr>  <chr>     <chr>         <dbl> <chr>                            
1 e10    g11508.t1 OG0010036  3.82e-56 adenine phosphoribosyltransferase

g11508に#1を振った